# SPDX-FileCopyrightText: © Sebastian Thomschke and contributors
# SPDX-License-Identifier: AGPL-3.0-or-later
# SPDX-ArtifactOfProjectHomePage: https://github.com/Second-Hand-Friends/kleinanzeigen-bot/
import json  # isort: skip
import asyncio
from gettext import gettext as _
from pathlib import Path
from typing import Any, Final, TypedDict
from unittest.mock import AsyncMock, MagicMock, call, patch
from urllib.error import URLError

import pytest
from jsonschema import Draft202012Validator
from ruamel.yaml import YAML

import kleinanzeigen_bot.extract as extract_module
from kleinanzeigen_bot.model.ad_model import AdPartial, ContactPartial
from kleinanzeigen_bot.model.config_model import Config, DownloadConfig
from kleinanzeigen_bot.utils.web_scraping_mixin import Browser, By, Element

# Location of the ad JSON schema, resolved relative to this test file.
SCHEMA_PATH:Final[Path] = Path(__file__).resolve().parents[2] / "schemas" / "ad.schema.json"


def _read_text_file(path:Path) -> str:
    """Return the content of ``path`` decoded as UTF-8."""
    return path.read_text(encoding = "utf-8")


# The TypedDicts below describe nested dict shapes used by test cases in this module.
class _DimensionsDict(TypedDict):
    ad_attributes:str


class _UniversalAnalyticsOptsDict(TypedDict):
    dimensions:_DimensionsDict


class _BelenConfDict(TypedDict):
    universalAnalyticsOpts:_UniversalAnalyticsOptsDict


class _SpecialAttributesDict(TypedDict, total=False):
    art_s:str
    condition_s:str


class _TestCaseDict(TypedDict):  # noqa: PYI049 Private TypedDict `...` is never used
    belen_conf:_BelenConfDict
    expected:_SpecialAttributesDict


@pytest.fixture
def test_extractor(browser_mock:MagicMock, test_bot_config:Config) -> extract_module.AdExtractor:
    """Provides a fresh extract_module.AdExtractor instance for testing.

    Dependencies:
        - browser_mock: Used to mock browser interactions
        - test_bot_config: Used to initialize the extractor with a valid configuration
    """
    return extract_module.AdExtractor(browser_mock, test_bot_config, Path("downloaded-ads"))


class TestAdExtractorBasics:
    """Basic synchronous tests for extract_module.AdExtractor."""

    def test_constructor(self, browser_mock:MagicMock, test_bot_config:Config) -> None:
        """Test the constructor of extract_module.AdExtractor"""
        extractor = extract_module.AdExtractor(browser_mock, test_bot_config, Path("downloaded-ads"))
        assert extractor.browser == browser_mock
        assert extractor.config == test_bot_config
        assert extractor.download_dir == Path("downloaded-ads")

    @pytest.mark.parametrize(
        ("url", "expected_id"),
        [
            ("https://www.kleinanzeigen.de/s-anzeige/test-title/12345678", 12345678),
            ("https://www.kleinanzeigen.de/s-anzeige/another-test/98765432", 98765432),
            ("https://www.kleinanzeigen.de/s-anzeige/invalid-id/abc", -1),
            ("https://www.kleinanzeigen.de/invalid-url", -1),
        ],
    )
    def test_extract_ad_id_from_ad_url(self, test_extractor:extract_module.AdExtractor, url:str, expected_id:int) -> None:
        """Test extraction of ad ID from different URL formats."""
        assert test_extractor.extract_ad_id_from_ad_url(url) == expected_id

    @pytest.mark.asyncio
    async def test_path_exists_helper(self, tmp_path:Path) -> None:
        """Test files.exists helper function."""
        from kleinanzeigen_bot.utils import files  # noqa: PLC0415

        # Test with existing path
        existing_file = tmp_path / "test.txt"
        existing_file.write_text("test")
        assert await files.exists(existing_file) is True
        assert await files.exists(str(existing_file)) is True

        # Test with non-existing path
        non_existing = tmp_path / "nonexistent.txt"
        assert await files.exists(non_existing) is False
        assert await files.exists(str(non_existing)) is False

    @pytest.mark.asyncio
    async def test_path_is_dir_helper(self, tmp_path:Path) -> None:
        """Test files.is_dir helper function."""
        from kleinanzeigen_bot.utils import files  # noqa: PLC0415

        # Test with directory
        test_dir = tmp_path / "testdir"
        test_dir.mkdir()
        assert await files.is_dir(test_dir) is True
        assert await files.is_dir(str(test_dir)) is True

        # Test with file
        test_file = tmp_path / "test.txt"
        test_file.write_text("test")
        assert await files.is_dir(test_file) is False
        assert await files.is_dir(str(test_file)) is False

        # Test with non-existing path
        non_existing = tmp_path / "nonexistent"
        assert await files.is_dir(non_existing) is False
        assert await files.is_dir(str(non_existing)) is False

    @pytest.mark.asyncio
    async def test_exists_async_helper(self, tmp_path:Path) -> None:
        """Test files.exists async helper function."""
        from kleinanzeigen_bot.utils import files  # noqa: PLC0415

        # Test with existing path
        existing_file = tmp_path / "test.txt"
        existing_file.write_text("test")
        assert await files.exists(existing_file) is True
        assert await files.exists(str(existing_file)) is True

        # Test with non-existing path
        non_existing = tmp_path / "nonexistent.txt"
        assert await files.exists(non_existing) is False
        assert await files.exists(str(non_existing)) is False

    @pytest.mark.asyncio
    async def test_isdir_async_helper(self, tmp_path:Path) -> None:
        """Test files.is_dir async helper function."""
        from kleinanzeigen_bot.utils import files  # noqa: PLC0415

        # Test with directory
        test_dir = tmp_path / "testdir"
        test_dir.mkdir()
        assert await files.is_dir(test_dir) is True
        assert await files.is_dir(str(test_dir)) is True

        # Test with file
        test_file = tmp_path / "test.txt"
        test_file.write_text("test")
        assert await files.is_dir(test_file) is False
        assert await files.is_dir(str(test_file)) is False

        # Test with non-existing path
        non_existing = tmp_path / "nonexistent"
        assert await files.is_dir(non_existing) is False
        assert await files.is_dir(str(non_existing)) is False

    def test_download_and_save_image_sync_success(self, tmp_path:Path) -> None:
        """Test _download_and_save_image_sync with successful download."""
        # MagicMock is already imported at module level; only mock_open is needed locally.
        from unittest.mock import mock_open  # noqa: PLC0415

        test_dir = tmp_path / "images"
        test_dir.mkdir()

        # Mock urllib response (usable as a context manager that yields itself)
        mock_response = MagicMock()
        mock_response.info().get_content_type.return_value = "image/jpeg"
        mock_response.__enter__ = MagicMock(return_value = mock_response)
        mock_response.__exit__ = MagicMock(return_value = False)

        with (
            patch("kleinanzeigen_bot.extract.urllib_request.urlopen", return_value = mock_response),
            patch("kleinanzeigen_bot.extract.open", mock_open()),
            patch("kleinanzeigen_bot.extract.shutil.copyfileobj"),
        ):
            result = extract_module.AdExtractor._download_and_save_image_sync("http://example.com/image.jpg", str(test_dir), "test_", 1)

            assert result is not None
            assert result.endswith((".jpe", ".jpeg", ".jpg"))
            assert "test_1" in result

    def test_download_and_save_image_sync_failure(self, tmp_path:Path) -> None:
        """Test _download_and_save_image_sync with download failure."""
        with patch("kleinanzeigen_bot.extract.urllib_request.urlopen", side_effect = URLError("Network error")):
            result = extract_module.AdExtractor._download_and_save_image_sync("http://example.com/image.jpg", str(tmp_path), "test_", 1)

            assert result is None


class TestAdExtractorPricing:
    """Tests for pricing related functionality."""

    @pytest.mark.parametrize(
        ("price_text", "expected_price", "expected_type"),
        [
            ("50 €", 50, "FIXED"),
            ("1.234 €", 1234, "FIXED"),
            ("50 € VB", 50, "NEGOTIABLE"),
            ("VB", None, "NEGOTIABLE"),
            ("Zu verschenken", None, "GIVE_AWAY"),
        ],
    )
    @pytest.mark.asyncio
    # pylint: disable=protected-access
    async def test_extract_pricing_info(
        self, test_extractor:extract_module.AdExtractor, price_text:str, expected_price:int | None, expected_type:str
    ) -> None:
        """Test price extraction with different formats"""
        with patch.object(test_extractor, "web_text", new_callable = AsyncMock, return_value = price_text):
            price, price_type = await test_extractor._extract_pricing_info_from_ad_page()
            assert price == expected_price
            assert price_type == expected_type

    @pytest.mark.asyncio
    # pylint: disable=protected-access
    async def test_extract_pricing_info_timeout(self, test_extractor:extract_module.AdExtractor) -> None:
        """Test price extraction when element is not found"""
        with patch.object(test_extractor, "web_text", new_callable = AsyncMock, side_effect = TimeoutError):
            price, price_type = await test_extractor._extract_pricing_info_from_ad_page()
            assert price is None
            assert price_type == "NOT_APPLICABLE"


class TestAdExtractorShipping:
    """Tests for shipping related functionality."""

    @pytest.mark.parametrize(
        ("shipping_text", "expected_type", "expected_cost"),
        [
            ("+ Versand ab 2,99 €", "SHIPPING", 2.99),
            ("Nur Abholung", "PICKUP", None),
            ("Versand möglich", "SHIPPING", None),
        ],
    )
    @pytest.mark.asyncio
    # pylint: disable=protected-access
    async def test_extract_shipping_info(
        self, test_extractor:extract_module.AdExtractor, shipping_text:str, expected_type:str, expected_cost:float | None
    ) -> None:
        """Test shipping info extraction with different text formats."""
        with (
            patch.object(test_extractor, "page", MagicMock()),
            patch.object(test_extractor, "web_text", new_callable = AsyncMock, return_value = shipping_text),
            patch.object(test_extractor, "web_request", new_callable = AsyncMock) as mock_web_request,
        ):
            if expected_cost:
                # round() instead of int(): int() truncates, so a float product such as
                # 298.999... would yield the wrong cent amount for future parameters.
                shipping_response:dict[str, Any] = {
                    "data": {"shippingOptionsResponse": {"options": [{"id": "DHL_001", "priceInEuroCent": round(expected_cost * 100), "packageSize": "SMALL"}]}}
                }
                mock_web_request.return_value = {"content": json.dumps(shipping_response)}

            shipping_type, costs, options = await test_extractor._extract_shipping_info_from_ad_page()

            assert shipping_type == expected_type
            assert costs == expected_cost
            if expected_cost:
                assert options == ["DHL_2"]
            else:
                assert options is None

    @pytest.mark.asyncio
    # pylint: disable=protected-access
    async def test_extract_shipping_info_with_options(self, test_extractor:extract_module.AdExtractor) -> None:
        """Test shipping info extraction with shipping options."""
        shipping_response = {
            "content": json.dumps({"data": {"shippingOptionsResponse": {"options": [{"id": "DHL_001", "priceInEuroCent": 549, "packageSize": "SMALL"}]}}})
        }

        with (
            patch.object(test_extractor, "page", MagicMock()),
            patch.object(test_extractor, "web_text", new_callable = AsyncMock, return_value = "+ Versand ab 5,49 €"),
            patch.object(test_extractor, "web_request", new_callable = AsyncMock, return_value = shipping_response),
        ):
            shipping_type, costs, options = await test_extractor._extract_shipping_info_from_ad_page()

            assert shipping_type == "SHIPPING"
            assert costs == 5.49
            assert options == ["DHL_2"]

    @pytest.mark.asyncio
    # pylint: disable=protected-access
    async def test_extract_shipping_info_with_all_matching_options(self, test_extractor:extract_module.AdExtractor) -> None:
        """Test shipping info extraction with all matching options enabled."""
        shipping_response = {
            "content": json.dumps(
                {
                    "data": {
                        "shippingOptionsResponse": {
                            "options": [
                                {"id": "HERMES_001", "priceInEuroCent": 489, "packageSize": "SMALL"},
                                {"id": "HERMES_002", "priceInEuroCent": 549, "packageSize": "SMALL"},
                                {"id": "DHL_001", "priceInEuroCent": 619, "packageSize": "SMALL"},
                            ]
                        }
                    }
                }
            )
        }

        # Enable all matching options in config
        test_extractor.config.download = DownloadConfig.model_validate({"include_all_matching_shipping_options": True})

        with (
            patch.object(test_extractor, "page", MagicMock()),
            patch.object(test_extractor, "web_text", new_callable = AsyncMock, return_value = "+ Versand ab 4,89 €"),
            patch.object(test_extractor, "web_request", new_callable = AsyncMock, return_value = shipping_response),
        ):
            shipping_type, costs, options = await test_extractor._extract_shipping_info_from_ad_page()

            assert shipping_type == "SHIPPING"
            assert costs == 4.89
            # NOTE(review): the else-branch is vacuous, so this test also passes when
            # options is None. Consider `assert options is not None` once confirmed
            # against the extractor implementation.
            if options is not None:
                assert sorted(options) == ["DHL_2", "Hermes_Päckchen", "Hermes_S"]
            else:
                assert options is None

    @pytest.mark.asyncio
    # pylint: disable=protected-access
    async def test_extract_shipping_info_with_all_matching_options_no_match(self, test_extractor:extract_module.AdExtractor) -> None:
        """Test shipping extraction when include-all is enabled but no option matches the price."""
        shipping_response = {
            "content": json.dumps(
                {
                    "data": {
                        "shippingOptionsResponse": {
                            "options": [
                                {"id": "DHL_001", "priceInEuroCent": 500, "packageSize": "SMALL"},
                                {"id": "HERMES_001", "priceInEuroCent": 600, "packageSize": "SMALL"},
                            ]
                        }
                    }
                }
            )
        }

        test_extractor.config.download = DownloadConfig.model_validate({"include_all_matching_shipping_options": True})

        with (
            patch.object(test_extractor, "page", MagicMock()),
            patch.object(test_extractor, "web_text", new_callable = AsyncMock, return_value = "+ Versand ab 4,89 €"),
            patch.object(test_extractor, "web_request", new_callable = AsyncMock, return_value = shipping_response),
        ):
            shipping_type, costs, options = await test_extractor._extract_shipping_info_from_ad_page()

            assert shipping_type == "SHIPPING"
            assert costs == 4.89
            assert options is None

    @pytest.mark.asyncio
    # pylint: disable=protected-access
    async def test_extract_shipping_info_with_excluded_options(self, test_extractor:extract_module.AdExtractor) -> None:
        """Test shipping info extraction with excluded options."""
        shipping_response = {
            "content": json.dumps(
                {
                    "data": {
                        "shippingOptionsResponse": {
                            "options": [
                                {"id": "HERMES_001", "priceInEuroCent": 489, "packageSize": "SMALL"},
                                {"id": "HERMES_002", "priceInEuroCent": 549, "packageSize": "SMALL"},
                                {"id": "DHL_001", "priceInEuroCent": 619, "packageSize": "SMALL"},
                            ]
                        }
                    }
                }
            )
        }

        # Enable all matching options and exclude DHL in config
        test_extractor.config.download = DownloadConfig.model_validate({"include_all_matching_shipping_options": True, "excluded_shipping_options": ["DHL_2"]})

        with (
            patch.object(test_extractor, "page", MagicMock()),
            patch.object(test_extractor, "web_text", new_callable = AsyncMock, return_value = "+ Versand ab 4,89 €"),
            patch.object(test_extractor, "web_request", new_callable = AsyncMock, return_value = shipping_response),
        ):
            shipping_type, costs, options = await test_extractor._extract_shipping_info_from_ad_page()

            assert shipping_type == "SHIPPING"
            assert costs == 4.89
            # NOTE(review): the else-branch is vacuous, so this test also passes when
            # options is None. Consider `assert options is not None` once confirmed
            # against the extractor implementation.
            if options is not None:
                assert sorted(options) == ["Hermes_Päckchen", "Hermes_S"]
            else:
                assert options is None

    @pytest.mark.asyncio
    # pylint: disable=protected-access
    async def test_extract_shipping_info_with_excluded_matching_option(self, test_extractor:extract_module.AdExtractor) -> None:
        """Test shipping info extraction when the matching option is excluded."""
        shipping_response = {
            "content": json.dumps(
                {
                    "data": {
                        "shippingOptionsResponse": {
                            "options": [
                                {"id": "HERMES_001", "priceInEuroCent": 489, "packageSize": "SMALL"},
                                {"id": "HERMES_002", "priceInEuroCent": 549, "packageSize": "SMALL"},
                            ]
                        }
                    }
                }
            )
        }

        # Exclude the matching option
        test_extractor.config.download = DownloadConfig.model_validate({"excluded_shipping_options": ["Hermes_Päckchen"]})

        with (
            patch.object(test_extractor, "page", MagicMock()),
            patch.object(test_extractor, "web_text", new_callable = AsyncMock, return_value = "+ Versand ab 4,89 €"),
            patch.object(test_extractor, "web_request", new_callable = AsyncMock, return_value = shipping_response),
        ):
            shipping_type, costs, options = await test_extractor._extract_shipping_info_from_ad_page()

            assert shipping_type == "SHIPPING"
            assert costs == 4.89
            assert options is None

    @pytest.mark.asyncio
    # pylint: disable=protected-access
    async def test_extract_shipping_info_with_no_matching_option(self, test_extractor:extract_module.AdExtractor) -> None:
        """Test shipping info extraction when price exists but NO matching option in API response."""
        shipping_response = {
            "content": json.dumps(
                {
                    "data": {
                        "shippingOptionsResponse": {
                            "options": [
                                {"id": "DHL_001", "priceInEuroCent": 500, "packageSize": "SMALL"},
                                {"id": "HERMES_001", "priceInEuroCent": 600, "packageSize": "SMALL"},
                            ]
                        }
                    }
                }
            )
        }

        with (
            patch.object(test_extractor, "page", MagicMock()),
            patch.object(test_extractor, "web_text", new_callable = AsyncMock, return_value = "+ Versand ab 7,00 €"),
            patch.object(test_extractor, "web_request", new_callable = AsyncMock, return_value = shipping_response),
        ):
            shipping_type, costs, options = await test_extractor._extract_shipping_info_from_ad_page()

            assert shipping_type == "SHIPPING"
            assert costs == 7.0
            assert options is None

    @pytest.mark.asyncio
    # pylint: disable=protected-access
    async def test_extract_shipping_info_timeout(self, test_extractor:extract_module.AdExtractor) -> None:
        """Test shipping info extraction when shipping element is missing (TimeoutError)."""
        with (
            patch.object(test_extractor, "page", MagicMock()),
            patch.object(test_extractor, "web_text", new_callable = AsyncMock, side_effect = TimeoutError),
        ):
            shipping_type, costs, options = await test_extractor._extract_shipping_info_from_ad_page()

            assert shipping_type == "NOT_APPLICABLE"
            assert costs is None
            assert options is None


class TestAdExtractorNavigation:
    """Tests for navigation related functionality."""

    @pytest.mark.asyncio
    async def test_navigate_to_ad_page_with_url(self, test_extractor:extract_module.AdExtractor) -> None:
        """Test navigation to ad page using a URL."""
        page_mock = AsyncMock()
        page_mock.url = "https://www.kleinanzeigen.de/s-anzeige/test/12345"

        with (
            patch.object(test_extractor, "page", page_mock),
            patch.object(test_extractor, "web_open", new_callable = AsyncMock) as mock_web_open,
            patch.object(test_extractor, "web_find", new_callable = AsyncMock, side_effect = TimeoutError),
        ):
            result = await test_extractor.navigate_to_ad_page("https://www.kleinanzeigen.de/s-anzeige/test/12345")
            assert result is True
            mock_web_open.assert_called_with("https://www.kleinanzeigen.de/s-anzeige/test/12345")

    @pytest.mark.asyncio
    async def test_navigate_to_ad_page_with_id(self, test_extractor:extract_module.AdExtractor) -> None:
        """Test navigation to ad page using an ID."""
        ad_id = 12345
        page_mock = AsyncMock()
        page_mock.url = f"https://www.kleinanzeigen.de/s-anzeige/test/{ad_id}"

        popup_close_mock = AsyncMock()
        popup_close_mock.click = AsyncMock()
        popup_close_mock.apply = AsyncMock(return_value = True)

        def find_mock(selector_type:By, selector_value:str, **_:Any) -> Element | None:
            # Only the popup close button is "found"; every other lookup yields None.
            if selector_type == By.CLASS_NAME and selector_value == "mfp-close":
                return popup_close_mock
            return None

        with (
            patch.object(test_extractor, "page", page_mock),
            patch.object(test_extractor, "web_open", new_callable = AsyncMock) as mock_web_open,
            patch.object(test_extractor, "web_find", new_callable = AsyncMock, side_effect = find_mock),
        ):
            result = await test_extractor.navigate_to_ad_page(ad_id)
            assert result is True
            mock_web_open.assert_called_with(f"https://www.kleinanzeigen.de/s-suchanfrage.html?keywords={ad_id}")
            popup_close_mock.click.assert_awaited_once()

    @pytest.mark.asyncio
    async def test_navigate_to_ad_page_with_popup(self, test_extractor:extract_module.AdExtractor) -> None:
        """Test navigation to ad page with popup handling."""
        page_mock = AsyncMock()
        page_mock.url = "https://www.kleinanzeigen.de/s-anzeige/test/12345"

        input_mock = AsyncMock()
        input_mock.clear_input = AsyncMock()
        input_mock.send_keys = AsyncMock()
        input_mock.apply = AsyncMock(return_value = True)

        with (
            patch.object(test_extractor, "page", page_mock),
            patch.object(test_extractor, "web_open", new_callable = AsyncMock),
            patch.object(test_extractor, "web_find", new_callable = AsyncMock, return_value = input_mock),
            patch.object(test_extractor, "web_click", new_callable = AsyncMock) as mock_web_click,
            patch.object(test_extractor, "web_check", new_callable = AsyncMock, return_value = True),
        ):
            result = await test_extractor.navigate_to_ad_page(12345)
            assert result is True
            mock_web_click.assert_called_with(By.CLASS_NAME, "mfp-close")

    @pytest.mark.asyncio
    async def test_navigate_to_ad_page_invalid_id(self, test_extractor:extract_module.AdExtractor) -> None:
        """Test navigation to ad page with invalid ID."""
        page_mock = AsyncMock()
        page_mock.url = "https://www.kleinanzeigen.de/s-suchen.html?k0"

        input_mock = AsyncMock()
        input_mock.clear_input = AsyncMock()
        input_mock.send_keys = AsyncMock()
        input_mock.apply = AsyncMock(return_value = True)
        input_mock.attrs = {}

        with (
            patch.object(test_extractor, "page", page_mock),
            patch.object(test_extractor, "web_open", new_callable = AsyncMock),
            patch.object(test_extractor, "web_find", new_callable = AsyncMock, return_value = input_mock),
        ):
            result = await test_extractor.navigate_to_ad_page(99999)
            assert result is False

    @pytest.mark.asyncio
    async def test_extract_own_ads_urls(self, test_extractor:extract_module.AdExtractor) -> None:
        """Test extraction of own ads URLs - basic test."""
        with (
            patch.object(test_extractor, "web_open", new_callable = AsyncMock),
            patch.object(test_extractor, "web_sleep", new_callable = AsyncMock),
            patch.object(test_extractor, "web_find", new_callable = AsyncMock) as mock_web_find,
            patch.object(test_extractor, "web_find_all", new_callable = AsyncMock) as mock_web_find_all,
            patch.object(test_extractor, "web_scroll_page_down", new_callable = AsyncMock),
            patch.object(test_extractor, "web_execute", new_callable = AsyncMock),
        ):
            # --- Setup mock objects for DOM elements ---
            # Mocks needed for the actual execution flow
            ad_list_container_mock = MagicMock()
            pagination_section_mock = MagicMock()
            cardbox_mock = MagicMock()  # Represents the <li> element
            link_mock = MagicMock()  # Represents the <a> element
            link_mock.attrs = {"href": "/s-anzeige/test/12345"}  # Configure the desired output

            # --- Setup mock responses for web_find and web_find_all in CORRECT ORDER ---

            # 1. Initial find for ad list container (before loop)
            # 2. Find for pagination section (pagination check)
            # 3. Find for ad list container (inside loop)
            # 4. Find for the link (inside list comprehension)
            mock_web_find.side_effect = [
                ad_list_container_mock,  # Call 1: find #my-manageitems-adlist (before loop)
                pagination_section_mock,  # Call 2: find .Pagination
                ad_list_container_mock,  # Call 3: find #my-manageitems-adlist (inside loop)
                link_mock,  # Call 4: find 'div h3 a.text-onSurface'
                # Add more mocks here if the pagination navigation logic calls web_find again
            ]

            # 1. Find all 'Nächste' buttons (pagination check) - Return empty list for single page test case
            # 2. Find all '.cardbox' elements (inside loop)
            mock_web_find_all.side_effect = [
                [],  # Call 1: find 'button[aria-label="Nächste"]' -> No next button = single page
                [cardbox_mock],  # Call 2: find .cardbox -> One ad item
                # Add more mocks here if pagination navigation calls web_find_all
            ]

            # --- Execute test and verify results ---
            refs = await test_extractor.extract_own_ads_urls()

            # --- Assertions ---
            assert refs == ["/s-anzeige/test/12345"]  # Now it should match

            # Optional: Verify calls were made as expected
            mock_web_find.assert_has_calls(
                [
                    call(By.ID, "my-manageitems-adlist"),
                    call(By.CSS_SELECTOR, ".Pagination", timeout = 10),
                    call(By.ID, "my-manageitems-adlist"),
                    call(By.CSS_SELECTOR, "div h3 a.text-onSurface", parent = cardbox_mock),
                ],
                any_order = False,
            )  # Check order if important

            mock_web_find_all.assert_has_calls(
                [
                    call(By.CSS_SELECTOR, 'button[aria-label="Nächste"]', parent = pagination_section_mock),
                    call(By.CLASS_NAME, "cardbox", parent = ad_list_container_mock),
                ],
                any_order = False,
            )

    @pytest.mark.asyncio
    async def test_extract_own_ads_urls_paginates_with_enabled_next_button(self, test_extractor:extract_module.AdExtractor) -> None:
        """Ensure the paginator clicks the first enabled next button and advances."""
        ad_list_container_mock = MagicMock()
        pagination_section_mock = MagicMock()
        cardbox_page_one = MagicMock()
        cardbox_page_two = MagicMock()
        link_page_one = MagicMock(attrs = {"href": "/s-anzeige/page-one/111"})
        link_page_two = MagicMock(attrs = {"href": "/s-anzeige/page-two/222"})

        next_button_enabled = AsyncMock()
        next_button_enabled.attrs = {}
        disabled_button = MagicMock()
        disabled_button.attrs = {"disabled": True}

        link_queue = [link_page_one, link_page_two]
        # Mutable counters so the nested fakes can track how often they were called.
        next_button_call = {"count": 0}
        cardbox_call = {"count": 0}

        async def fake_web_find(selector_type:By, selector_value:str, *, parent:Element | None = None, timeout:int | float | None = None) -> Element:
            if selector_type == By.ID and selector_value == "my-manageitems-adlist":
                return ad_list_container_mock
            if selector_type == By.CSS_SELECTOR and selector_value == ".Pagination":
                return pagination_section_mock
            if selector_type == By.CSS_SELECTOR and selector_value == "div h3 a.text-onSurface":
                return link_queue.pop(0)
            raise AssertionError(f"Unexpected selector {selector_type} {selector_value}")

        async def fake_web_find_all(
            selector_type:By, selector_value:str, *, parent:Element | None = None, timeout:int | float | None = None
        ) -> list[Element]:
            if selector_type == By.CSS_SELECTOR and selector_value == 'button[aria-label="Nächste"]':
                next_button_call["count"] += 1
                if next_button_call["count"] == 1:
                    return [next_button_enabled]  # initial detection -> multi page
                if next_button_call["count"] == 2:
                    return [disabled_button, next_button_enabled]  # navigation on page 1
                return []  # after navigating, stop
            if selector_type == By.CLASS_NAME and selector_value == "cardbox":
                cardbox_call["count"] += 1
                return [cardbox_page_one] if cardbox_call["count"] == 1 else [cardbox_page_two]
            raise AssertionError(f"Unexpected find_all selector {selector_type} {selector_value}")

        with (
            patch.object(test_extractor, "web_open", new_callable = AsyncMock),
            patch.object(test_extractor, "web_scroll_page_down", new_callable = AsyncMock),
            patch.object(test_extractor, "web_sleep", new_callable = AsyncMock),
            patch.object(test_extractor, "web_find", new_callable = AsyncMock, side_effect = fake_web_find),
            patch.object(test_extractor, "web_find_all", new_callable = AsyncMock, side_effect = fake_web_find_all),
        ):
            refs = await test_extractor.extract_own_ads_urls()

        assert refs == ["/s-anzeige/page-one/111", "/s-anzeige/page-two/222"]
        next_button_enabled.click.assert_awaited()  # triggered once during navigation

    @pytest.mark.asyncio
    async def test_extract_own_ads_urls_timeout_in_callback(self, test_extractor:extract_module.AdExtractor) -> None:
        """Test that TimeoutError in extract_page_refs callback stops pagination."""
        # Setup: every web_find call returns the ad list container, while every
        # web_find_all call raises TimeoutError (simulating missing ad items).
        ad_list_container_mock = MagicMock()

        with (
            patch.object(test_extractor, "web_open", new_callable = AsyncMock),
            patch.object(test_extractor, "web_sleep", new_callable = AsyncMock),
            patch.object(test_extractor, "web_find", new_callable = AsyncMock, return_value = ad_list_container_mock),
            patch.object(test_extractor, "web_find_all", new_callable = AsyncMock, side_effect = TimeoutError("Ad items not found")),
            patch.object(test_extractor, "web_scroll_page_down", new_callable = AsyncMock),
            patch.object(test_extractor, "web_execute", new_callable = AsyncMock),
        ):
            refs = await test_extractor.extract_own_ads_urls()

            # Pagination should stop (TimeoutError in callback returns True)
            assert refs == []

    @pytest.mark.asyncio
    async def test_extract_own_ads_urls_skips_single_item_timeout(self, test_extractor:extract_module.AdExtractor) -> None:
        """Timeout on one ad item should skip that item but keep extracting others."""
        ad_list_container_mock = MagicMock()
        first_item = MagicMock()
        second_item = MagicMock()
        valid_link = MagicMock()
        valid_link.attrs = {"href": "/s-anzeige/ok/999"}

        with (
            patch.object(test_extractor, "web_open", new_callable = AsyncMock),
            patch.object(test_extractor, "web_sleep", new_callable = AsyncMock),
            patch.object(test_extractor, "web_scroll_page_down", new_callable = AsyncMock),
            patch.object(test_extractor, "web_find_all", new_callable = AsyncMock, return_value = [first_item, second_item]),
            patch.object(
                test_extractor,
                "web_find",
                new_callable = AsyncMock,
                side_effect = [ad_list_container_mock, TimeoutError(), ad_list_container_mock, TimeoutError(), valid_link],
            ),
        ):
            refs = await test_extractor.extract_own_ads_urls()

        assert refs == ["/s-anzeige/ok/999"]

    @pytest.mark.asyncio
    async def test_extract_own_ads_urls_skips_single_item_without_href(self, test_extractor:extract_module.AdExtractor) -> None:
        """Anchor without href should be skipped instead of adding a 'None' entry."""
        ad_list_container_mock = MagicMock()
        first_item = MagicMock()
        second_item = MagicMock()
        missing_href_link = MagicMock()
        missing_href_link.attrs = {}
        valid_link = MagicMock()
        valid_link.attrs = {"href": "/s-anzeige/ok/999"}

        with (
            patch.object(test_extractor, "web_open", new_callable = AsyncMock),
            patch.object(test_extractor, "web_sleep", new_callable = AsyncMock),
            patch.object(test_extractor, "web_scroll_page_down", new_callable = AsyncMock),
            patch.object(test_extractor, "web_find_all", new_callable = AsyncMock, return_value = [first_item, second_item]),
            patch.object(
                test_extractor,
                "web_find",
                new_callable = AsyncMock,
                side_effect = [ad_list_container_mock, TimeoutError(), ad_list_container_mock, missing_href_link, valid_link],
            ),
        ):
            refs = await test_extractor.extract_own_ads_urls()

        assert refs == ["/s-anzeige/ok/999"]

    @pytest.mark.asyncio
    async def test_extract_own_ads_urls_generic_exception_in_callback(self, test_extractor:extract_module.AdExtractor) -> None:
        """Test that generic Exception in extract_page_refs callback continues pagination."""
        with (
            patch.object(test_extractor, "web_open", new_callable = AsyncMock),
            patch.object(test_extractor, "web_sleep", new_callable = AsyncMock),
            patch.object(test_extractor, "web_find", new_callable = AsyncMock) as mock_web_find,
            patch.object(test_extractor, "web_scroll_page_down", new_callable = AsyncMock),
        ):
            # Setup: ad list container exists, but web_find_all raises generic Exception
            ad_list_container_mock = MagicMock()

            call_count = {"count": 0}

            def mock_find_side_effect(*args:Any, **kwargs:Any) -> Element:
                call_count["count"] += 1
                # Second call: pagination check - raise TimeoutError to indicate no pagination
                if call_count["count"] == 2:
                    raise TimeoutError("No pagination")
                # First call (before pagination loop) and third call onwards
                # (inside callback): ad list container
                return ad_list_container_mock

            mock_web_find.side_effect = mock_find_side_effect

            # Make web_find_all raise a generic exception
            with patch.object(test_extractor, "web_find_all", new_callable = AsyncMock, side_effect = AttributeError("Unexpected error")):
                refs = await test_extractor.extract_own_ads_urls()

                # Pagination should continue despite exception (callback returns False)
                # Since it's a single page (no pagination), refs should be empty
                assert refs == []


class TestAdExtractorContent:
    """Tests for content extraction functionality."""

    # pylint: disable=protected-access

    @pytest.mark.asyncio
    async def test_extract_description_with_affixes(
        self, test_extractor:extract_module.AdExtractor, description_test_cases:list[tuple[dict[str, Any], str, str]], test_bot_config:Config
    ) -> None:
        """Test extraction of description with various prefix/suffix configurations."""
        # Mock the page
        page_mock = MagicMock()
        page_mock.url = "https://www.kleinanzeigen.de/s-anzeige/test/12345"
        test_extractor.page = page_mock

        for config, raw_description, _expected_description in description_test_cases:
            test_extractor.config = test_bot_config.with_values(config)

            with patch.multiple(
                test_extractor,
                web_text = AsyncMock(
                    side_effect = [
                        "Test Title",  # Title
                        raw_description,  # Raw description (without affixes)
                        "03.02.2025",  # Creation date
                    ]
                ),
                web_execute = AsyncMock(return_value = {"universalAnalyticsOpts": {"dimensions": {"l3_category_id": "", "ad_attributes": ""}}}),
                _extract_category_from_ad_page = AsyncMock(return_value = "160"),
                _extract_special_attributes_from_ad_page = AsyncMock(return_value = {}),
                _extract_pricing_info_from_ad_page = AsyncMock(return_value = (None, "NOT_APPLICABLE")),
                _extract_shipping_info_from_ad_page = AsyncMock(return_value = ("NOT_APPLICABLE", None, None)),
                _extract_sell_directly_from_ad_page = AsyncMock(return_value = False),
                _download_images_from_ad_page = AsyncMock(return_value = []),
                _extract_contact_from_ad_page = AsyncMock(return_value = {}),
            ):
                info = await test_extractor._extract_ad_page_info("/some/dir", 12345)
                assert info.description == raw_description

    @pytest.mark.asyncio
    async def test_extract_description_with_affixes_timeout(self, test_extractor:extract_module.AdExtractor) -> None:
        """Test handling of timeout when extracting description."""
        # Mock the page
        page_mock = MagicMock()
        page_mock.url = "https://www.kleinanzeigen.de/s-anzeige/test/12345"
        test_extractor.page = page_mock

        with patch.multiple(
            test_extractor,
            web_text = AsyncMock(
                side_effect = [
                    "Test Title",  # Title succeeds
                    TimeoutError("Timeout"),  # Description times out
                    "03.02.2025",  # Date succeeds
                ]
            ),
            web_execute = AsyncMock(return_value = {"universalAnalyticsOpts": {"dimensions": {"l3_category_id": "", "ad_attributes": ""}}}),
            _extract_category_from_ad_page = AsyncMock(return_value = "160"),
            _extract_special_attributes_from_ad_page = AsyncMock(return_value = {}),
            _extract_pricing_info_from_ad_page = AsyncMock(return_value = (None, "NOT_APPLICABLE")),
            _extract_shipping_info_from_ad_page = AsyncMock(return_value = ("NOT_APPLICABLE", None, None)),
            _extract_sell_directly_from_ad_page = AsyncMock(return_value = False),
            _download_images_from_ad_page = AsyncMock(return_value = []),
            _extract_contact_from_ad_page = AsyncMock(return_value = ContactPartial()),
        ):
            try:
                info = await test_extractor._extract_ad_page_info("/some/dir", 12345)
                assert not info.description
            except TimeoutError:
                # This is also acceptable - depends on how we want to handle timeouts
                pass

    @pytest.mark.asyncio
    async def test_extract_description_with_affixes_no_affixes(self, test_extractor:extract_module.AdExtractor) -> None:
        """Test extraction of description without any affixes in config."""
        # Mock the page
        page_mock = MagicMock()
        page_mock.url = "https://www.kleinanzeigen.de/s-anzeige/test/12345"
        test_extractor.page = page_mock
        raw_description = "Original Description"

        with patch.multiple(
            test_extractor,
            web_text = AsyncMock(
                side_effect = [
                    "Test Title",  # Title
                    raw_description,  # Description without affixes
                    "03.02.2025",  # Creation date
                ]
            ),
            web_execute = AsyncMock(return_value = {"universalAnalyticsOpts": {"dimensions": {"l3_category_id": "", "ad_attributes": ""}}}),
            _extract_category_from_ad_page = AsyncMock(return_value = "160"),
            _extract_special_attributes_from_ad_page = AsyncMock(return_value = {}),
            _extract_pricing_info_from_ad_page = AsyncMock(return_value = (None, "NOT_APPLICABLE")),
            _extract_shipping_info_from_ad_page = AsyncMock(return_value = ("NOT_APPLICABLE", None, None)),
            _extract_sell_directly_from_ad_page = AsyncMock(return_value = False),
            _download_images_from_ad_page = AsyncMock(return_value = []),
            _extract_contact_from_ad_page = AsyncMock(return_value = ContactPartial()),
        ):
            info = await test_extractor._extract_ad_page_info("/some/dir", 12345)
            assert info.description == raw_description

    @pytest.mark.asyncio
    async def test_extract_sell_directly_data_hit_true(self, test_extractor:extract_module.AdExtractor) -> None:
        """Test sell_directly extraction with data hit - buyNowEligible=True."""
        # Setup extractor with published ads data
        test_extractor.published_ads_by_id = {123456789: {"id": 123456789, "buyNowEligible": True}}

        # Setup page URL
        test_extractor.page = MagicMock()
        test_extractor.page.url = "https://www.kleinanzeigen.de/s-anzeige/test-ad/123456789"

        result = await test_extractor._extract_sell_directly_from_ad_page()
        assert result is True

    @pytest.mark.asyncio
    async def test_extract_sell_directly_data_hit_false(self, test_extractor:extract_module.AdExtractor) -> None:
        """Test sell_directly extraction with data hit - buyNowEligible=False."""
        test_extractor.published_ads_by_id = {123456789: {"id": 123456789, "buyNowEligible": False}}

        test_extractor.page = MagicMock()
        test_extractor.page.url = "https://www.kleinanzeigen.de/s-anzeige/test-ad/123456789"

        result = await test_extractor._extract_sell_directly_from_ad_page()
        assert result is False

    @pytest.mark.asyncio
    async def test_extract_sell_directly_data_miss(self, test_extractor:extract_module.AdExtractor) -> None:
        """Test sell_directly extraction with data miss - ad ID not in cache returns None."""
        # Cache has a different ad ID than the one in the URL - true data miss
        test_extractor.published_ads_by_id = {987654321: {"id": 987654321, "buyNowEligible": True}}

        test_extractor.page = MagicMock()
        test_extractor.page.url = "https://www.kleinanzeigen.de/s-anzeige/test-ad/123456789"

        result = await test_extractor._extract_sell_directly_from_ad_page()
        assert result is None

    @pytest.mark.asyncio
    async def test_extract_sell_directly_empty_published_ads(self, test_extractor:extract_module.AdExtractor) -> None:
        """Test sell_directly extraction with empty published_ads_by_id - returns None."""
        test_extractor.published_ads_by_id = {}

        test_extractor.page = MagicMock()
        test_extractor.page.url = "https://www.kleinanzeigen.de/s-anzeige/test-ad/123456789"

        result = await test_extractor._extract_sell_directly_from_ad_page()
        assert result is None

    @pytest.mark.asyncio
    async def test_extract_sell_directly_invalid_url(self, test_extractor:extract_module.AdExtractor) -> None:
        """Test sell_directly extraction with invalid URL - returns None."""
        test_extractor.published_ads_by_id = {123456789: {"id": 123456789, "buyNowEligible": True}}

        test_extractor.page = MagicMock()
        test_extractor.page.url = "https://www.kleinanzeigen.de/invalid-url"

        result = await test_extractor._extract_sell_directly_from_ad_page()
        assert result is None

    @pytest.mark.asyncio
    async def test_extract_sell_directly_non_boolean_value(self, test_extractor:extract_module.AdExtractor) -> None:
        """Test sell_directly extraction when buyNowEligible is not a boolean."""
        test_extractor.published_ads_by_id = {123456789: {"id": 123456789, "buyNowEligible": "true"}}  # String, not bool

        test_extractor.page = MagicMock()
        test_extractor.page.url = "https://www.kleinanzeigen.de/s-anzeige/test-ad/123456789"

        result = await test_extractor._extract_sell_directly_from_ad_page()
        assert result is None

    @pytest.mark.asyncio
    async def test_extract_sell_directly_missing_buy_now_field(self, test_extractor:extract_module.AdExtractor) -> None:
        """Test sell_directly extraction when buyNowEligible field is missing."""
        test_extractor.published_ads_by_id = {123456789: {"id": 123456789, "state": "active"}}  # No buyNowEligible

        test_extractor.page = MagicMock()
        test_extractor.page.url = "https://www.kleinanzeigen.de/s-anzeige/test-ad/123456789"

        result = await test_extractor._extract_sell_directly_from_ad_page()
        assert result is None

    @pytest.mark.asyncio
    async def test_extract_sell_directly_integer_value(self, test_extractor:extract_module.AdExtractor) -> None:
        """Test sell_directly extraction when buyNowEligible is an integer (not bool)."""
        test_extractor.published_ads_by_id = {123456789: {"id": 123456789, "buyNowEligible": 1}}  # Integer, not bool

        test_extractor.page = MagicMock()
        test_extractor.page.url = "https://www.kleinanzeigen.de/s-anzeige/test-ad/123456789"

        result = await test_extractor._extract_sell_directly_from_ad_page()
        assert result is None


class TestAdExtractorCategory:
    """Tests for category extraction functionality."""

    @pytest.fixture
    def extractor(self, test_bot_config:Config) -> extract_module.AdExtractor:
        browser_mock = MagicMock(spec = Browser)
        config = test_bot_config.with_values({"ad_defaults": {"description": {"prefix": "Test Prefix", "suffix": "Test Suffix"}}})
return extract_module.AdExtractor(browser_mock, config, Path("downloaded-ads")) @pytest.mark.asyncio # pylint: disable=protected-access async def test_extract_category(self, extractor:extract_module.AdExtractor) -> None: """Test category extraction from breadcrumb.""" category_line = MagicMock() first_part = MagicMock() first_part.attrs = {"href": "/s-familie-kind-baby/c17"} second_part = MagicMock() second_part.attrs = {"href": "/s-spielzeug/c23"} with ( patch.object(extractor, "web_find", new_callable = AsyncMock, side_effect = [category_line]) as mock_web_find, patch.object(extractor, "web_find_all", new_callable = AsyncMock, return_value = [first_part, second_part]) as mock_web_find_all, ): result = await extractor._extract_category_from_ad_page() assert result == "17/23" mock_web_find.assert_awaited_once_with(By.ID, "vap-brdcrmb") mock_web_find_all.assert_awaited_once_with(By.CSS_SELECTOR, "a", parent = category_line) @pytest.mark.asyncio # pylint: disable=protected-access async def test_extract_category_single_identifier(self, extractor:extract_module.AdExtractor) -> None: """Test category extraction when only a single breadcrumb code exists.""" category_line = MagicMock() first_part = MagicMock() first_part.attrs = {"href": "/s-kleidung/c42"} with ( patch.object(extractor, "web_find", new_callable = AsyncMock, side_effect = [category_line]) as mock_web_find, patch.object(extractor, "web_find_all", new_callable = AsyncMock, return_value = [first_part]) as mock_web_find_all, ): result = await extractor._extract_category_from_ad_page() assert result == "42/42" mock_web_find.assert_awaited_once_with(By.ID, "vap-brdcrmb") mock_web_find_all.assert_awaited_once_with(By.CSS_SELECTOR, "a", parent = category_line) @pytest.mark.asyncio # pylint: disable=protected-access async def test_extract_category_fallback_to_legacy_selectors(self, extractor:extract_module.AdExtractor, caplog:pytest.LogCaptureFixture) -> None: """Test category extraction when breadcrumb links are 
not available and legacy selectors are used.""" category_line = MagicMock() first_part = MagicMock() first_part.attrs = {"href": 12345} # Ensure str() conversion happens second_part = MagicMock() second_part.attrs = {"href": 67890} # This will need str() conversion caplog.set_level("DEBUG") expected_message = _("Falling back to legacy breadcrumb selectors; collected ids: %s") % [] with ( patch.object(extractor, "web_find", new_callable = AsyncMock) as mock_web_find, patch.object(extractor, "web_find_all", new_callable = AsyncMock, side_effect = TimeoutError) as mock_web_find_all, ): mock_web_find.side_effect = [category_line, first_part, second_part] result = await extractor._extract_category_from_ad_page() assert result == "12345/67890" assert sum(1 for record in caplog.records if record.message == expected_message) == 1 mock_web_find.assert_any_call(By.ID, "vap-brdcrmb") mock_web_find.assert_any_call(By.CSS_SELECTOR, "a:nth-of-type(2)", parent = category_line) mock_web_find.assert_any_call(By.CSS_SELECTOR, "a:nth-of-type(3)", parent = category_line) mock_web_find_all.assert_awaited_once_with(By.CSS_SELECTOR, "a", parent = category_line) @pytest.mark.asyncio async def test_extract_category_legacy_selectors_timeout(self, extractor:extract_module.AdExtractor, caplog:pytest.LogCaptureFixture) -> None: """Ensure fallback timeout logs the error and re-raises with translated message.""" category_line = MagicMock() async def fake_web_find(selector_type:By, selector_value:str, *, parent:Element | None = None, timeout:int | float | None = None) -> Element: if selector_type == By.ID and selector_value == "vap-brdcrmb": return category_line raise TimeoutError("legacy selectors missing") with ( patch.object(extractor, "web_find", new_callable = AsyncMock, side_effect = fake_web_find), patch.object(extractor, "web_find_all", new_callable = AsyncMock, side_effect = TimeoutError), caplog.at_level("ERROR"), pytest.raises(TimeoutError, match = "Unable to locate breadcrumb fallback 
selectors"), ): await extractor._extract_category_from_ad_page() assert any("Legacy breadcrumb selectors not found" in record.message for record in caplog.records) @pytest.mark.asyncio # pylint: disable=protected-access async def test_extract_special_attributes_empty(self, extractor:extract_module.AdExtractor) -> None: """Test extraction of special attributes when empty.""" with patch.object(extractor, "web_execute", new_callable = AsyncMock) as mock_web_execute: mock_web_execute.return_value = {"universalAnalyticsOpts": {"dimensions": {"ad_attributes": ""}}} result = await extractor._extract_special_attributes_from_ad_page(mock_web_execute.return_value) assert result == {} @pytest.mark.asyncio # pylint: disable=protected-access async def test_extract_special_attributes_not_empty(self, extractor:extract_module.AdExtractor) -> None: """Test extraction of special attributes when not empty.""" special_atts = { "universalAnalyticsOpts": { "dimensions": {"ad_attributes": "versand_s:t|color_s:creme|groesse_s:68|condition_s:alright|type_s:accessoires|art_s:maedchen"} } } result = await extractor._extract_special_attributes_from_ad_page(special_atts) assert len(result) == 5 assert "versand_s" not in result assert "color_s" in result assert result["color_s"] == "creme" assert "groesse_s" in result assert result["groesse_s"] == "68" assert "condition_s" in result assert result["condition_s"] == "alright" assert "type_s" in result assert result["type_s"] == "accessoires" assert "art_s" in result assert result["art_s"] == "maedchen" @pytest.mark.asyncio # pylint: disable=protected-access async def test_extract_special_attributes_missing_ad_attributes(self, extractor:extract_module.AdExtractor) -> None: """Test extraction of special attributes when ad_attributes key is missing.""" belen_conf:dict[str, Any] = { "universalAnalyticsOpts": { "dimensions": { # ad_attributes key is completely missing } } } result = await extractor._extract_special_attributes_from_ad_page(belen_conf) 
assert result == {} class TestAdExtractorContact: """Tests for contact information extraction.""" @pytest.fixture def extractor(self, test_bot_config:Config) -> extract_module.AdExtractor: browser_mock = MagicMock(spec = Browser) config = test_bot_config.with_values({"ad_defaults": {"description": {"prefix": "Test Prefix", "suffix": "Test Suffix"}}}) return extract_module.AdExtractor(browser_mock, config, Path("downloaded-ads")) @pytest.mark.asyncio # pylint: disable=protected-access async def test_extract_contact_info(self, extractor:extract_module.AdExtractor) -> None: """Test extraction of contact information.""" with ( patch.object(extractor, "page", MagicMock()), patch.object(extractor, "web_text", new_callable = AsyncMock) as mock_web_text, patch.object(extractor, "web_find", new_callable = AsyncMock) as mock_web_find, ): mock_web_text.side_effect = [ "12345 Berlin - Mitte", "Example Street 123,", "Test User", ] mock_web_find.side_effect = [ MagicMock(), # contact person element MagicMock(), # name element TimeoutError(), # phone element (simulating no phone) ] contact_info = await extractor._extract_contact_from_ad_page() assert contact_info.street == "Example Street 123" assert contact_info.zipcode == "12345" assert contact_info.location == "Berlin - Mitte" assert contact_info.name == "Test User" assert contact_info.phone is None @pytest.mark.asyncio # pylint: disable=protected-access async def test_extract_contact_info_timeout(self, extractor:extract_module.AdExtractor) -> None: """Test contact info extraction when elements are not found.""" with ( patch.object(extractor, "page", MagicMock()), patch.object(extractor, "web_text", new_callable = AsyncMock, side_effect = TimeoutError()), patch.object(extractor, "web_find", new_callable = AsyncMock, side_effect = TimeoutError()), pytest.raises(TimeoutError), ): await extractor._extract_contact_from_ad_page() @pytest.mark.asyncio # pylint: disable=protected-access async def 
test_extract_contact_info_with_phone(self, extractor:extract_module.AdExtractor) -> None: """Test extraction of contact information including phone number.""" with ( patch.object(extractor, "page", MagicMock()), patch.object(extractor, "web_text", new_callable = AsyncMock) as mock_web_text, patch.object(extractor, "web_find", new_callable = AsyncMock) as mock_web_find, ): mock_web_text.side_effect = ["12345 Berlin - Mitte", "Example Street 123,", "Test User", "+49(0)1234 567890"] phone_element = MagicMock() mock_web_find.side_effect = [ MagicMock(), # contact person element MagicMock(), # name element phone_element, # phone element ] contact_info = await extractor._extract_contact_from_ad_page() assert contact_info.phone == "01234567890" # Normalized phone number class TestAdExtractorDownload: """Tests for download functionality.""" @pytest.fixture def extractor(self, test_bot_config:Config) -> extract_module.AdExtractor: browser_mock = MagicMock(spec = Browser) config = test_bot_config.with_values({"ad_defaults": {"description": {"prefix": "Test Prefix", "suffix": "Test Suffix"}}}) return extract_module.AdExtractor(browser_mock, config, Path("downloaded-ads")) @pytest.mark.asyncio async def test_download_ad(self, extractor:extract_module.AdExtractor, tmp_path:Path) -> None: """Test downloading an ad - directory creation and saving ad data.""" # Use tmp_path for OS-agnostic path handling download_base = tmp_path / "downloaded-ads" final_dir = download_base / "ad_12345_Test Advertisement Title" yaml_path = final_dir / "ad_12345.yaml" extractor.download_dir = download_base with ( patch("kleinanzeigen_bot.extract.dicts.save_dict", autospec = True) as mock_save_dict, patch.object(extractor, "_extract_ad_page_info_with_directory_handling", new_callable = AsyncMock) as mock_extract_with_dir, ): mock_extract_with_dir.return_value = ( AdPartial.model_validate( { "title": "Test Advertisement Title", "description": "Test Description", "category": "Dienstleistungen", "price": 
100, "images": [], "contact": {"name": "Test User", "street": "Test Street 123", "zipcode": "12345", "location": "Test City"}, } ), str(final_dir), ) await extractor.download_ad(12345) # Verify observable behavior: extraction and save were called mock_extract_with_dir.assert_called_once() mock_save_dict.assert_called_once() # Verify saved to correct location with correct data actual_call = mock_save_dict.call_args actual_path = Path(actual_call[0][0]) assert actual_path == yaml_path assert actual_call[0][1] == mock_extract_with_dir.return_value[0].model_dump(mode = "json") @pytest.mark.asyncio async def test_download_ad_writes_schema_compliant_yaml(self, extractor:extract_module.AdExtractor, tmp_path:Path) -> None: """Test that downloaded ad YAML validates against ad.schema.json.""" download_base = tmp_path / "downloaded-ads" final_dir = download_base / "ad_12345_Test Advertisement Title" yaml_path = final_dir / "ad_12345.yaml" extractor.download_dir = download_base with patch.object(extractor, "_extract_ad_page_info_with_directory_handling", new_callable = AsyncMock) as mock_extract_with_dir: mock_extract_with_dir.return_value = ( AdPartial.model_validate( { "title": "Test Advertisement Title", "description": "Test Description", "category": "Dienstleistungen", "created_on": "2026-03-08T00:00:00+01:00", "updated_on": "2026-03-09T01:02:03+01:00", } ), final_dir, ) await extractor.download_ad(12345) loaded_ad = YAML(typ = "safe").load(await asyncio.to_thread(_read_text_file, yaml_path)) schema = json.loads(await asyncio.to_thread(_read_text_file, SCHEMA_PATH)) Draft202012Validator(schema).validate(loaded_ad) assert isinstance(loaded_ad["created_on"], str) assert isinstance(loaded_ad["updated_on"], str) @pytest.mark.asyncio # pylint: disable=protected-access async def test_download_images_no_images(self, extractor:extract_module.AdExtractor) -> None: """Test image download when no images are found.""" with patch.object(extractor, "web_find", new_callable = AsyncMock, 
side_effect = TimeoutError): image_paths = await extractor._download_images_from_ad_page("/some/dir", 12345) assert len(image_paths) == 0 @pytest.mark.asyncio # pylint: disable=protected-access async def test_download_images_with_none_url(self, extractor:extract_module.AdExtractor) -> None: """Test image download when some images have None as src attribute.""" image_box_mock = MagicMock() # Create image elements - one with valid src, one with None src img_with_url = MagicMock() img_with_url.attrs = {"src": "http://example.com/valid_image.jpg"} img_without_url = MagicMock() img_without_url.attrs = {"src": None} with ( patch.object(extractor, "web_find", new_callable = AsyncMock, return_value = image_box_mock), patch.object(extractor, "web_find_all", new_callable = AsyncMock, return_value = [img_with_url, img_without_url]), patch.object(extract_module.AdExtractor, "_download_and_save_image_sync", return_value = "/some/dir/ad_12345__img1.jpg"), ): image_paths = await extractor._download_images_from_ad_page("/some/dir", 12345) # Should only download the one valid image (skip the None) assert len(image_paths) == 1 assert image_paths[0] == "ad_12345__img1.jpg" @pytest.mark.asyncio # pylint: disable=protected-access async def test_extract_ad_page_info_with_directory_handling_final_dir_exists(self, extractor:extract_module.AdExtractor, tmp_path:Path) -> None: """Test directory handling when final_dir already exists - it should be deleted.""" base_dir = tmp_path / "downloaded-ads" base_dir.mkdir() # Create the final directory that should be deleted final_dir = base_dir / "ad_12345_Test Title" final_dir.mkdir() old_file = final_dir / "old_file.txt" old_file.write_text("old content") # Mock the page page_mock = MagicMock() page_mock.url = "https://www.kleinanzeigen.de/s-anzeige/test/12345" extractor.page = page_mock with ( patch.object( extractor, "web_text", new_callable = AsyncMock, side_effect = [ "Test Title", # Title extraction "Test Title", # Second title call for full 
extraction "Description text", # Description "03.02.2025", # Creation date ], ), patch.object( extractor, "web_execute", new_callable = AsyncMock, return_value = {"universalAnalyticsOpts": {"dimensions": {"l3_category_id": "", "ad_attributes": ""}}}, ), patch.object(extractor, "_extract_category_from_ad_page", new_callable = AsyncMock, return_value = "160"), patch.object(extractor, "_extract_special_attributes_from_ad_page", new_callable = AsyncMock, return_value = {}), patch.object(extractor, "_extract_pricing_info_from_ad_page", new_callable = AsyncMock, return_value = (None, "NOT_APPLICABLE")), patch.object(extractor, "_extract_shipping_info_from_ad_page", new_callable = AsyncMock, return_value = ("NOT_APPLICABLE", None, None)), patch.object(extractor, "_extract_sell_directly_from_ad_page", new_callable = AsyncMock, return_value = False), patch.object(extractor, "_download_images_from_ad_page", new_callable = AsyncMock, return_value = []), patch.object( extractor, "_extract_contact_from_ad_page", new_callable = AsyncMock, return_value = ContactPartial( name = "Test", zipcode = "12345", location = "Berlin", ), ), ): ad_cfg, result_dir = await extractor._extract_ad_page_info_with_directory_handling(base_dir, 12345) # Verify the old directory was deleted and recreated assert result_dir == final_dir assert result_dir.exists() assert not old_file.exists() # Old file should be gone assert ad_cfg.title == "Test Title" @pytest.mark.asyncio # pylint: disable=protected-access async def test_extract_ad_page_info_with_directory_handling_rename_enabled(self, extractor:extract_module.AdExtractor, tmp_path:Path) -> None: """Test directory handling when temp_dir exists and rename_existing_folders is True.""" base_dir = tmp_path / "downloaded-ads" base_dir.mkdir() # Create the temp directory (without title) temp_dir = base_dir / "ad_12345" temp_dir.mkdir() existing_file = temp_dir / "existing_image.jpg" existing_file.write_text("existing image data") # Enable 
rename_existing_folders in config extractor.config.download.rename_existing_folders = True # Mock the page page_mock = MagicMock() page_mock.url = "https://www.kleinanzeigen.de/s-anzeige/test/12345" extractor.page = page_mock with ( patch.object( extractor, "web_text", new_callable = AsyncMock, side_effect = [ "Test Title", # Title extraction "Test Title", # Second title call for full extraction "Description text", # Description "03.02.2025", # Creation date ], ), patch.object( extractor, "web_execute", new_callable = AsyncMock, return_value = {"universalAnalyticsOpts": {"dimensions": {"l3_category_id": "", "ad_attributes": ""}}}, ), patch.object(extractor, "_extract_category_from_ad_page", new_callable = AsyncMock, return_value = "160"), patch.object(extractor, "_extract_special_attributes_from_ad_page", new_callable = AsyncMock, return_value = {}), patch.object(extractor, "_extract_pricing_info_from_ad_page", new_callable = AsyncMock, return_value = (None, "NOT_APPLICABLE")), patch.object(extractor, "_extract_shipping_info_from_ad_page", new_callable = AsyncMock, return_value = ("NOT_APPLICABLE", None, None)), patch.object(extractor, "_extract_sell_directly_from_ad_page", new_callable = AsyncMock, return_value = False), patch.object(extractor, "_download_images_from_ad_page", new_callable = AsyncMock, return_value = []), patch.object( extractor, "_extract_contact_from_ad_page", new_callable = AsyncMock, return_value = ContactPartial( name = "Test", zipcode = "12345", location = "Berlin", ), ), ): ad_cfg, result_dir = await extractor._extract_ad_page_info_with_directory_handling(base_dir, 12345) # Verify the directory was renamed from temp_dir to final_dir final_dir = base_dir / "ad_12345_Test Title" assert result_dir == final_dir assert result_dir.exists() assert not temp_dir.exists() # Old temp dir should be gone assert (result_dir / "existing_image.jpg").exists() # File should be preserved assert ad_cfg.title == "Test Title" @pytest.mark.asyncio # pylint: 
disable=protected-access async def test_extract_ad_page_info_with_directory_handling_use_existing(self, extractor:extract_module.AdExtractor, tmp_path:Path) -> None: """Test directory handling when temp_dir exists and rename_existing_folders is False (default).""" base_dir = tmp_path / "downloaded-ads" base_dir.mkdir() # Create the temp directory (without title) temp_dir = base_dir / "ad_12345" temp_dir.mkdir() existing_file = temp_dir / "existing_image.jpg" existing_file.write_text("existing image data") # Ensure rename_existing_folders is False (default) extractor.config.download.rename_existing_folders = False # Mock the page page_mock = MagicMock() page_mock.url = "https://www.kleinanzeigen.de/s-anzeige/test/12345" extractor.page = page_mock with ( patch.object( extractor, "web_text", new_callable = AsyncMock, side_effect = [ "Test Title", # Title extraction "Test Title", # Second title call for full extraction "Description text", # Description "03.02.2025", # Creation date ], ), patch.object( extractor, "web_execute", new_callable = AsyncMock, return_value = {"universalAnalyticsOpts": {"dimensions": {"l3_category_id": "", "ad_attributes": ""}}}, ), patch.object(extractor, "_extract_category_from_ad_page", new_callable = AsyncMock, return_value = "160"), patch.object(extractor, "_extract_special_attributes_from_ad_page", new_callable = AsyncMock, return_value = {}), patch.object(extractor, "_extract_pricing_info_from_ad_page", new_callable = AsyncMock, return_value = (None, "NOT_APPLICABLE")), patch.object(extractor, "_extract_shipping_info_from_ad_page", new_callable = AsyncMock, return_value = ("NOT_APPLICABLE", None, None)), patch.object(extractor, "_extract_sell_directly_from_ad_page", new_callable = AsyncMock, return_value = False), patch.object(extractor, "_download_images_from_ad_page", new_callable = AsyncMock, return_value = []), patch.object( extractor, "_extract_contact_from_ad_page", new_callable = AsyncMock, return_value = ContactPartial( name = 
"Test", zipcode = "12345", location = "Berlin", ), ), ): ad_cfg, result_dir = await extractor._extract_ad_page_info_with_directory_handling(base_dir, 12345) # Verify the existing temp_dir was used (not renamed) assert result_dir == temp_dir assert result_dir.exists() assert (result_dir / "existing_image.jpg").exists() # File should be preserved assert ad_cfg.title == "Test Title" @pytest.mark.asyncio async def test_download_ad_with_umlauts_in_title(self, extractor:extract_module.AdExtractor, tmp_path:Path) -> None: """Test cross-platform Unicode handling for ad titles with umlauts (issue #728). Verifies that: 1. Directories are created with NFC-normalized names (via sanitize_folder_name) 2. Files can be saved to those directories (via save_dict's NFC normalization) 3. No FileNotFoundError occurs due to NFC/NFD mismatch on Linux/Windows """ # Title with German umlauts (ä) - common in real ads title_with_umlauts = "KitchenAid Zuhälter - nie benutzt" # Mock the page page_mock = MagicMock() page_mock.url = "https://www.kleinanzeigen.de/s-anzeige/test/12345" extractor.page = page_mock base_dir = tmp_path / "downloaded-ads" base_dir.mkdir() with ( patch.object( extractor, "web_text", new_callable = AsyncMock, side_effect = [ title_with_umlauts, # Title extraction title_with_umlauts, # Second title call for full extraction "Description text", # Description "03.02.2025", # Creation date ], ), patch.object( extractor, "web_execute", new_callable = AsyncMock, return_value = {"universalAnalyticsOpts": {"dimensions": {"l3_category_id": "", "ad_attributes": ""}}}, ), patch.object(extractor, "_extract_category_from_ad_page", new_callable = AsyncMock, return_value = "160"), patch.object(extractor, "_extract_special_attributes_from_ad_page", new_callable = AsyncMock, return_value = {}), patch.object(extractor, "_extract_pricing_info_from_ad_page", new_callable = AsyncMock, return_value = (None, "NOT_APPLICABLE")), patch.object(extractor, "_extract_shipping_info_from_ad_page", 
new_callable = AsyncMock, return_value = ("NOT_APPLICABLE", None, None)), patch.object(extractor, "_extract_sell_directly_from_ad_page", new_callable = AsyncMock, return_value = False), patch.object(extractor, "_download_images_from_ad_page", new_callable = AsyncMock, return_value = []), patch.object( extractor, "_extract_contact_from_ad_page", new_callable = AsyncMock, return_value = ContactPartial( name = "Test", zipcode = "12345", location = "Berlin", ), ), ): ad_cfg, result_dir = await extractor._extract_ad_page_info_with_directory_handling(base_dir, 12345) # Verify directory was created with NFC-normalized name assert result_dir.exists() assert ad_cfg.title == title_with_umlauts # Test saving YAML file to the Unicode directory path # Before fix: Failed on Linux/Windows due to NFC/NFD mismatch # After fix: Both directory and file use NFC normalization ad_file_path = Path(result_dir) / "ad_12345.yaml" from kleinanzeigen_bot.utils import dicts # noqa: PLC0415 header_string = ( "# yaml-language-server: $schema=https://raw.githubusercontent.com/Second-Hand-Friends/kleinanzeigen-bot/refs/heads/main/schemas/ad.schema.json" ) # save_dict normalizes path to NFC, matching the NFC directory name dicts.save_dict(str(ad_file_path), ad_cfg.model_dump(), header = header_string) # Verify file was created successfully (no FileNotFoundError) assert ad_file_path.exists() assert ad_file_path.is_file()