Files
kleinanzeigen-bot/tests/unit/test_extract.py
Jens 36ca178574 feat: upgrade nodriver from 0.39 to 0.47 (#635)
## ℹ️ Description
Upgrade nodriver dependency from pinned version 0.39.0 to latest 0.47.0
to resolve browser startup issues and JavaScript evaluation problems
that affected versions 0.40-0.44.

- Link to the related issue(s): Resolves nodriver compatibility issues
- This upgrade addresses browser startup problems and window.BelenConf
evaluation failures that were blocking the use of newer nodriver
versions.

## 📋 Changes Summary

- Updated nodriver dependency from pinned 0.39.0 to >=0.47.0 in
pyproject.toml
- Fixed RemoteObject handling in web_execute method for nodriver 0.47
compatibility
- Added comprehensive BelenConf test fixture with real production data
structure
- Added integration test to validate window.BelenConf evaluation works
correctly
- Added German translation for new error message
- Replaced real user data with privacy-safe dummy data in test fixtures

### 🔧 Type Safety Improvements

**Added explicit `str()` conversions to resolve type inference issues:**

The comprehensive BelenConf test fixture contains deeply nested data
structures that caused pyright's type checker to infer complex
dictionary types throughout the codebase. To ensure type safety and
prevent runtime errors, I added explicit `str()` conversions in key
locations:

- **CSRF tokens**: `str(csrf_token)` - Ensures CSRF tokens are treated
as strings
- **Special attributes**: `str(special_attribute_value)` - Converts
special attribute values to strings
- **DOM attributes**: `str(special_attr_elem.attrs.id)` - Ensures
element IDs are strings
- **URL handling**: `str(current_img_url)` and `str(href_attributes)` -
Converts URLs and href attributes to strings
- **Price values**: `str(ad_cfg.price)` - Ensures price values are
strings

These conversions are defensive programming measures that ensure
backward compatibility and prevent type-related runtime errors, even if
the underlying data structures change in the future.

### ⚙️ Type of Change
- [x] New feature (adds new functionality without breaking existing
usage)
- [ ] 🐞 Bug fix (non-breaking change which fixes an issue)
- [ ] 💥 Breaking change (changes that might break existing user setups,
scripts, or configurations)

## Checklist
Before requesting a review, confirm the following:
- [x] I have reviewed my changes to ensure they meet the project's
standards.
- [x] I have tested my changes and ensured that all tests pass (`pdm run
test`).
- [x] I have formatted the code (`pdm run format`).
- [x] I have verified that linting passes (`pdm run lint`).
- [x] I have updated documentation where necessary.

By submitting this pull request, I confirm that you can use, modify,
copy, and redistribute this contribution, under the terms of your
choice.
2025-10-12 21:22:46 +02:00

989 lines
45 KiB
Python

# SPDX-FileCopyrightText: © Sebastian Thomschke and contributors
# SPDX-License-Identifier: AGPL-3.0-or-later
# SPDX-ArtifactOfProjectHomePage: https://github.com/Second-Hand-Friends/kleinanzeigen-bot/
import json, os # isort: skip
from typing import Any, TypedDict
from unittest.mock import AsyncMock, MagicMock, call, patch
import pytest
from kleinanzeigen_bot.extract import AdExtractor
from kleinanzeigen_bot.model.ad_model import AdPartial, ContactPartial
from kleinanzeigen_bot.model.config_model import Config, DownloadConfig
from kleinanzeigen_bot.utils.web_scraping_mixin import Browser, By, Element
class _DimensionsDict(TypedDict):
    """Typed shape of a ``dimensions`` mapping; only ``dimension108`` is declared."""
    dimension108:str


class _UniversalAnalyticsOptsDict(TypedDict):
    """Typed shape of a ``universalAnalyticsOpts`` mapping holding the dimensions."""
    dimensions:_DimensionsDict


class _BelenConfDict(TypedDict):
    """Typed shape of a BelenConf-style dict, reduced to the analytics options."""
    universalAnalyticsOpts:_UniversalAnalyticsOptsDict


class _SpecialAttributesDict(TypedDict, total = False):
    """Typed shape of special attributes; ``total = False`` makes each key optional."""
    art_s:str
    condition_s:str


class _TestCaseDict(TypedDict):  # noqa: PYI049 Private TypedDict `...` is never used
    """Pairs a BelenConf-style input with the special attributes expected for it."""
    belen_conf:_BelenConfDict
    expected:_SpecialAttributesDict
class TestAdExtractorBasics:
    """Synchronous checks of AdExtractor construction and ad-ID parsing."""

    def test_constructor(self, browser_mock:MagicMock, test_bot_config:Config) -> None:
        """A new AdExtractor exposes the browser and config it was built with."""
        created = AdExtractor(browser_mock, test_bot_config)

        assert created.browser == browser_mock
        assert created.config == test_bot_config

    @pytest.mark.parametrize(("url", "expected_id"), [
        ("https://www.kleinanzeigen.de/s-anzeige/test-title/12345678", 12345678),
        ("https://www.kleinanzeigen.de/s-anzeige/another-test/98765432", 98765432),
        ("https://www.kleinanzeigen.de/s-anzeige/invalid-id/abc", -1),
        ("https://www.kleinanzeigen.de/invalid-url", -1),
    ])
    def test_extract_ad_id_from_ad_url(self, test_extractor:AdExtractor, url:str, expected_id:int) -> None:
        """Each URL yields the listed ID; the two malformed URLs are expected to yield -1."""
        parsed_id = test_extractor.extract_ad_id_from_ad_url(url)

        assert parsed_id == expected_id
class TestAdExtractorPricing:
    """Checks how the price text of an ad page is turned into (price, price type)."""

    @pytest.mark.parametrize(("price_text", "expected_price", "expected_type"), [
        ("50 €", 50, "FIXED"),
        ("1.234 €", 1234, "FIXED"),
        ("50 € VB", 50, "NEGOTIABLE"),
        ("VB", None, "NEGOTIABLE"),
        ("Zu verschenken", None, "GIVE_AWAY"),
    ])
    @pytest.mark.asyncio
    # pylint: disable=protected-access
    async def test_extract_pricing_info(
        self, test_extractor:AdExtractor, price_text:str, expected_price:int | None, expected_type:str
    ) -> None:
        """Each price text maps to the listed price and price type."""
        # web_text is replaced so the extractor reads the parametrized text instead of a page.
        text_reader = AsyncMock(return_value = price_text)
        with patch.object(test_extractor, "web_text", text_reader):
            amount, kind = await test_extractor._extract_pricing_info_from_ad_page()

            assert amount == expected_price
            assert kind == expected_type

    @pytest.mark.asyncio
    # pylint: disable=protected-access
    async def test_extract_pricing_info_timeout(self, test_extractor:AdExtractor) -> None:
        """A TimeoutError from web_text results in (None, "NOT_APPLICABLE")."""
        failing_reader = AsyncMock(side_effect = TimeoutError)
        with patch.object(test_extractor, "web_text", failing_reader):
            amount, kind = await test_extractor._extract_pricing_info_from_ad_page()

            assert amount is None
            assert kind == "NOT_APPLICABLE"
class TestAdExtractorShipping:
    """Checks how shipping type, cost and options are derived from an ad page."""

    @staticmethod
    def _options_payload(*options:tuple[str, int]) -> dict[str, Any]:
        """Build the shipping-options structure from ``(id, price in euro cent)`` pairs.

        Every generated option uses the package size ``"SMALL"``.
        """
        return {
            "data": {
                "shippingOptionsResponse": {
                    "options": [
                        {"id": option_id, "priceInEuroCent": cents, "packageSize": "SMALL"}
                        for option_id, cents in options
                    ]
                }
            }
        }

    @classmethod
    def _web_response(cls, *options:tuple[str, int]) -> dict[str, str]:
        """Wrap the JSON-encoded options payload the way the mocked web_request returns it."""
        return {"content": json.dumps(cls._options_payload(*options))}

    @pytest.mark.parametrize(("shipping_text", "expected_type", "expected_cost"), [
        ("+ Versand ab 2,99 €", "SHIPPING", 2.99),
        ("Nur Abholung", "PICKUP", None),
        ("Versand möglich", "SHIPPING", None),
    ])
    @pytest.mark.asyncio
    # pylint: disable=protected-access
    async def test_extract_shipping_info(
        self, test_extractor:AdExtractor, shipping_text:str, expected_type:str, expected_cost:float | None
    ) -> None:
        """Each shipping text maps to the listed type and cost; options only exist with a cost."""
        with (
            patch.object(test_extractor, "page", MagicMock()),
            patch.object(test_extractor, "web_text", new_callable = AsyncMock, return_value = shipping_text),
            patch.object(test_extractor, "web_request", new_callable = AsyncMock) as mock_web_request,
        ):
            # A shipping-options response is only prepared for the case that carries a cost.
            if expected_cost:
                mock_web_request.return_value = self._web_response(("DHL_001", int(expected_cost * 100)))

            shipping_type, costs, options = await test_extractor._extract_shipping_info_from_ad_page()

            assert shipping_type == expected_type
            assert costs == expected_cost
            if not expected_cost:
                assert options is None
            else:
                assert options == ["DHL_2"]

    @pytest.mark.asyncio
    # pylint: disable=protected-access
    async def test_extract_shipping_info_with_options(self, test_extractor:AdExtractor) -> None:
        """A single DHL option priced like the page text is reported as ["DHL_2"]."""
        response = self._web_response(("DHL_001", 549))

        with (
            patch.object(test_extractor, "page", MagicMock()),
            patch.object(test_extractor, "web_text", new_callable = AsyncMock, return_value = "+ Versand ab 5,49 €"),
            patch.object(test_extractor, "web_request", new_callable = AsyncMock, return_value = response),
        ):
            shipping_type, costs, options = await test_extractor._extract_shipping_info_from_ad_page()

            assert shipping_type == "SHIPPING"
            assert costs == 5.49
            assert options == ["DHL_2"]

    @pytest.mark.asyncio
    # pylint: disable=protected-access
    async def test_extract_shipping_info_with_all_matching_options(self, test_extractor:AdExtractor) -> None:
        """With include_all_matching_shipping_options set, all three options are expected."""
        response = self._web_response(("HERMES_001", 489), ("HERMES_002", 549), ("DHL_001", 619))

        # Turn on the "include all matching options" switch of the download config.
        test_extractor.config.download = DownloadConfig.model_validate({"include_all_matching_shipping_options": True})

        with (
            patch.object(test_extractor, "page", MagicMock()),
            patch.object(test_extractor, "web_text", new_callable = AsyncMock, return_value = "+ Versand ab 4,89 €"),
            patch.object(test_extractor, "web_request", new_callable = AsyncMock, return_value = response),
        ):
            shipping_type, costs, options = await test_extractor._extract_shipping_info_from_ad_page()

            assert shipping_type == "SHIPPING"
            assert costs == 4.89
            # NOTE(review): a None result passes without further checks - confirm that is intended.
            if options is not None:
                assert sorted(options) == ["DHL_2", "Hermes_Päckchen", "Hermes_S"]

    @pytest.mark.asyncio
    # pylint: disable=protected-access
    async def test_extract_shipping_info_with_excluded_options(self, test_extractor:AdExtractor) -> None:
        """Excluding "DHL_2" leaves only the two Hermes options in the expected result."""
        response = self._web_response(("HERMES_001", 489), ("HERMES_002", 549), ("DHL_001", 619))

        # Include all matching options, but list DHL_2 as excluded.
        test_extractor.config.download = DownloadConfig.model_validate({
            "include_all_matching_shipping_options": True,
            "excluded_shipping_options": ["DHL_2"]
        })

        with (
            patch.object(test_extractor, "page", MagicMock()),
            patch.object(test_extractor, "web_text", new_callable = AsyncMock, return_value = "+ Versand ab 4,89 €"),
            patch.object(test_extractor, "web_request", new_callable = AsyncMock, return_value = response),
        ):
            shipping_type, costs, options = await test_extractor._extract_shipping_info_from_ad_page()

            assert shipping_type == "SHIPPING"
            assert costs == 4.89
            # NOTE(review): a None result passes without further checks - confirm that is intended.
            if options is not None:
                assert sorted(options) == ["Hermes_Päckchen", "Hermes_S"]

    @pytest.mark.asyncio
    # pylint: disable=protected-access
    async def test_extract_shipping_info_with_excluded_matching_option(self, test_extractor:AdExtractor) -> None:
        """Excluding "Hermes_Päckchen" is expected to give NOT_APPLICABLE, cost 4.89 and no options."""
        response = self._web_response(("HERMES_001", 489), ("HERMES_002", 549))

        # Only the exclusion list is configured here.
        test_extractor.config.download = DownloadConfig.model_validate({
            "excluded_shipping_options": ["Hermes_Päckchen"]
        })

        with (
            patch.object(test_extractor, "page", MagicMock()),
            patch.object(test_extractor, "web_text", new_callable = AsyncMock, return_value = "+ Versand ab 4,89 €"),
            patch.object(test_extractor, "web_request", new_callable = AsyncMock, return_value = response),
        ):
            shipping_type, costs, options = await test_extractor._extract_shipping_info_from_ad_page()

            assert shipping_type == "NOT_APPLICABLE"
            assert costs == 4.89
            assert options is None
class TestAdExtractorNavigation:
    """Checks navigation to ad pages and collection of the user's own ad URLs."""

    @pytest.mark.asyncio
    async def test_navigate_to_ad_page_with_url(self, test_extractor:AdExtractor) -> None:
        """Navigating by URL opens exactly that URL and reports success."""
        ad_url = "https://www.kleinanzeigen.de/s-anzeige/test/12345"
        page_mock = AsyncMock()
        page_mock.url = ad_url

        with (
            patch.object(test_extractor, "page", page_mock),
            patch.object(test_extractor, "web_open", new_callable = AsyncMock) as mock_web_open,
            patch.object(test_extractor, "web_find", new_callable = AsyncMock, side_effect = TimeoutError),
        ):
            succeeded = await test_extractor.navigate_to_ad_page(ad_url)

            assert succeeded is True
            mock_web_open.assert_called_with(ad_url)

    @pytest.mark.asyncio
    async def test_navigate_to_ad_page_with_id(self, test_extractor:AdExtractor) -> None:
        """Navigating by ID opens the keyword search URL and clicks the popup close element."""
        ad_id = 12345
        page_mock = AsyncMock()
        page_mock.url = f"https://www.kleinanzeigen.de/s-anzeige/test/{ad_id}"

        popup_close_mock = AsyncMock()
        popup_close_mock.click = AsyncMock()
        popup_close_mock.apply = AsyncMock(return_value = True)

        def find_mock(selector_type:By, selector_value:str, **_:Any) -> Element | None:
            # Only the lookup of the popup close button yields an element.
            is_popup_close = selector_type == By.CLASS_NAME and selector_value == "mfp-close"
            return popup_close_mock if is_popup_close else None

        with (
            patch.object(test_extractor, "page", page_mock),
            patch.object(test_extractor, "web_open", new_callable = AsyncMock) as mock_web_open,
            patch.object(test_extractor, "web_find", new_callable = AsyncMock, side_effect = find_mock),
        ):
            succeeded = await test_extractor.navigate_to_ad_page(ad_id)

            assert succeeded is True
            mock_web_open.assert_called_with(f"https://www.kleinanzeigen.de/s-suchanfrage.html?keywords={ad_id}")
            popup_close_mock.click.assert_awaited_once()

    @pytest.mark.asyncio
    async def test_navigate_to_ad_page_with_popup(self, test_extractor:AdExtractor) -> None:
        """When web_check reports True, the popup is closed via web_click on "mfp-close"."""
        page_mock = AsyncMock()
        page_mock.url = "https://www.kleinanzeigen.de/s-anzeige/test/12345"

        input_mock = AsyncMock()
        input_mock.clear_input = AsyncMock()
        input_mock.send_keys = AsyncMock()
        input_mock.apply = AsyncMock(return_value = True)

        with (
            patch.object(test_extractor, "page", page_mock),
            patch.object(test_extractor, "web_open", new_callable = AsyncMock),
            patch.object(test_extractor, "web_find", new_callable = AsyncMock, return_value = input_mock),
            patch.object(test_extractor, "web_click", new_callable = AsyncMock) as mock_web_click,
            patch.object(test_extractor, "web_check", new_callable = AsyncMock, return_value = True),
        ):
            succeeded = await test_extractor.navigate_to_ad_page(12345)

            assert succeeded is True
            mock_web_click.assert_called_with(By.CLASS_NAME, "mfp-close")

    @pytest.mark.asyncio
    async def test_navigate_to_ad_page_invalid_id(self, test_extractor:AdExtractor) -> None:
        """With the page left on the search URL, navigation by ID reports failure."""
        page_mock = AsyncMock()
        page_mock.url = "https://www.kleinanzeigen.de/s-suchen.html?k0"

        input_mock = AsyncMock()
        input_mock.clear_input = AsyncMock()
        input_mock.send_keys = AsyncMock()
        input_mock.apply = AsyncMock(return_value = True)
        input_mock.attrs = {}

        with (
            patch.object(test_extractor, "page", page_mock),
            patch.object(test_extractor, "web_open", new_callable = AsyncMock),
            patch.object(test_extractor, "web_find", new_callable = AsyncMock, return_value = input_mock),
        ):
            succeeded = await test_extractor.navigate_to_ad_page(99999)

            assert succeeded is False

    @pytest.mark.asyncio
    async def test_extract_own_ads_urls(self, test_extractor:AdExtractor) -> None:
        """Single-page case: one cardbox with one link yields exactly that link's href."""
        with (
            patch.object(test_extractor, "web_open", new_callable = AsyncMock),
            patch.object(test_extractor, "web_sleep", new_callable = AsyncMock),
            patch.object(test_extractor, "web_find", new_callable = AsyncMock) as mock_web_find,
            patch.object(test_extractor, "web_find_all", new_callable = AsyncMock) as mock_web_find_all,
            patch.object(test_extractor, "web_scroll_page_down", new_callable = AsyncMock),
            patch.object(test_extractor, "web_execute", new_callable = AsyncMock),
        ):
            # Stand-ins for the DOM elements that the lookups below hand out.
            ad_list_container_mock = MagicMock()
            pagination_section_mock = MagicMock()
            cardbox_mock = MagicMock()  # stands for the <li> element
            link_mock = MagicMock()  # stands for the <a> element
            link_mock.attrs = {"href": "/s-anzeige/test/12345"}

            # web_find results, consumed strictly in this order.
            mock_web_find.side_effect = [
                ad_list_container_mock,  # 1: ad list container
                pagination_section_mock,  # 2: pagination section
                ad_list_container_mock,  # 3: ad list container again
                link_mock  # 4: the link looked up inside the cardbox
            ]

            # web_find_all results, consumed strictly in this order.
            mock_web_find_all.side_effect = [
                [],  # 1: no "Nächste" button
                [cardbox_mock]  # 2: one cardbox
            ]

            refs = await test_extractor.extract_own_ads_urls()

            assert refs == ["/s-anzeige/test/12345"]

            # The lookups must have happened with these arguments, in this sequence.
            expected_find_calls = [
                call(By.ID, "my-manageitems-adlist"),
                call(By.CSS_SELECTOR, ".Pagination", timeout = 10),
                call(By.ID, "my-manageitems-adlist"),
                call(By.CSS_SELECTOR, "div h3 a.text-onSurface", parent = cardbox_mock),
            ]
            mock_web_find.assert_has_calls(expected_find_calls, any_order = False)

            expected_find_all_calls = [
                call(By.CSS_SELECTOR, 'button[aria-label="Nächste"]', parent = pagination_section_mock),
                call(By.CLASS_NAME, "cardbox", parent = ad_list_container_mock),
            ]
            mock_web_find_all.assert_has_calls(expected_find_all_calls, any_order = False)
class TestAdExtractorContent:
    """Checks extraction of description and sell-directly flag from an ad page."""
    # pylint: disable=protected-access

    @staticmethod
    def _page_info_patches(web_text_values:list[Any], contact:Any) -> dict[str, AsyncMock]:
        """Build the ``patch.multiple`` keyword arguments shared by the description tests.

        ``web_text_values`` becomes the side effect of ``web_text`` (exception instances in
        the list are raised by the mock), ``contact`` is what the contact extraction returns.
        All other collaborators return fixed neutral values.
        """
        belen_conf = {
            "universalAnalyticsOpts": {
                "dimensions": {
                    "dimension92": "",
                    "dimension108": ""
                }
            }
        }
        return {
            "web_text": AsyncMock(side_effect = web_text_values),
            "web_execute": AsyncMock(return_value = belen_conf),
            "_extract_category_from_ad_page": AsyncMock(return_value = "160"),
            "_extract_special_attributes_from_ad_page": AsyncMock(return_value = {}),
            "_extract_pricing_info_from_ad_page": AsyncMock(return_value = (None, "NOT_APPLICABLE")),
            "_extract_shipping_info_from_ad_page": AsyncMock(return_value = ("NOT_APPLICABLE", None, None)),
            "_extract_sell_directly_from_ad_page": AsyncMock(return_value = False),
            "_download_images_from_ad_page": AsyncMock(return_value = []),
            "_extract_contact_from_ad_page": AsyncMock(return_value = contact),
        }

    @pytest.fixture
    def extractor_with_config(self) -> AdExtractor:
        """Provide an AdExtractor built from a mocked browser and a default Config."""
        mocked_browser = MagicMock(spec = Browser)
        return AdExtractor(mocked_browser, Config())

    @pytest.mark.asyncio
    async def test_extract_description_with_affixes(
        self,
        test_extractor:AdExtractor,
        description_test_cases:list[tuple[dict[str, Any], str, str]],
        test_bot_config:Config
    ) -> None:
        """For every affix configuration the extracted description equals the raw text."""
        page_mock = MagicMock()
        page_mock.url = "https://www.kleinanzeigen.de/s-anzeige/test/12345"
        test_extractor.page = page_mock

        # The third tuple element (expected description) is not used by this test.
        for config, raw_description, _ in description_test_cases:
            test_extractor.config = test_bot_config.with_values(config)

            # web_text answers in order: title, description, creation date.
            patches = self._page_info_patches(["Test Title", raw_description, "03.02.2025"], {})
            with patch.multiple(test_extractor, **patches):
                info = await test_extractor._extract_ad_page_info("/some/dir", 12345)
                assert info.description == raw_description

    @pytest.mark.asyncio
    async def test_extract_description_with_affixes_timeout(
        self,
        test_extractor:AdExtractor
    ) -> None:
        """A timeout on the description read either propagates or leaves the description empty."""
        page_mock = MagicMock()
        page_mock.url = "https://www.kleinanzeigen.de/s-anzeige/test/12345"
        test_extractor.page = page_mock

        # web_text answers in order: title, then a raised TimeoutError, then creation date.
        patches = self._page_info_patches(
            ["Test Title", TimeoutError("Timeout"), "03.02.2025"],
            ContactPartial()
        )
        with patch.multiple(test_extractor, **patches):
            try:
                info = await test_extractor._extract_ad_page_info("/some/dir", 12345)
            except TimeoutError:
                # Propagating the timeout is accepted as well.
                pass
            else:
                assert not info.description

    @pytest.mark.asyncio
    async def test_extract_description_with_affixes_no_affixes(
        self,
        test_extractor:AdExtractor
    ) -> None:
        """Without touching the config, the extracted description equals the raw text."""
        page_mock = MagicMock()
        page_mock.url = "https://www.kleinanzeigen.de/s-anzeige/test/12345"
        test_extractor.page = page_mock
        raw_description = "Original Description"

        # web_text answers in order: title, description, creation date.
        patches = self._page_info_patches(["Test Title", raw_description, "03.02.2025"], ContactPartial())
        with patch.multiple(test_extractor, **patches):
            info = await test_extractor._extract_ad_page_info("/some/dir", 12345)
            assert info.description == raw_description

    @pytest.mark.asyncio
    async def test_extract_sell_directly(self, test_extractor:AdExtractor) -> None:
        """"Direkt kaufen" gives True, other text gives False, a timeout gives None."""
        expectations = [
            ("Direkt kaufen", True),
            ("Other text", False),
        ]

        for page_text, expected_flag in expectations:
            with patch.object(test_extractor, "web_text", new_callable = AsyncMock, return_value = page_text):
                flag = await test_extractor._extract_sell_directly_from_ad_page()
                assert flag is expected_flag

        with patch.object(test_extractor, "web_text", new_callable = AsyncMock, side_effect = TimeoutError):
            flag = await test_extractor._extract_sell_directly_from_ad_page()
            assert flag is None
class TestAdExtractorCategory:
    """Checks extraction of the category and the special attributes of an ad."""

    @pytest.fixture
    def extractor(self, test_bot_config:Config) -> AdExtractor:
        """Provide an AdExtractor with a mocked browser and a description prefix/suffix set."""
        affix_overrides = {
            "ad_defaults": {
                "description": {
                    "prefix": "Test Prefix",
                    "suffix": "Test Suffix"
                }
            }
        }
        return AdExtractor(MagicMock(spec = Browser), test_bot_config.with_values(affix_overrides))

    @pytest.mark.asyncio
    # pylint: disable=protected-access
    async def test_extract_category(self, extractor:AdExtractor) -> None:
        """Two breadcrumb links ending in c17 and c23 are expected to give "17/23"."""
        category_line = MagicMock()
        first_part = MagicMock()
        first_part.attrs = {"href": "/s-familie-kind-baby/c17"}
        second_part = MagicMock()
        second_part.attrs = {"href": "/s-spielzeug/c23"}

        with patch.object(extractor, "web_find", new_callable = AsyncMock) as mock_web_find:
            # Lookups return, in order: the breadcrumb line, then its two links.
            mock_web_find.side_effect = [category_line, first_part, second_part]

            category = await extractor._extract_category_from_ad_page()

            assert category == "17/23"
            mock_web_find.assert_any_call(By.ID, "vap-brdcrmb")
            for link_selector in ("a:nth-of-type(2)", "a:nth-of-type(3)"):
                mock_web_find.assert_any_call(By.CSS_SELECTOR, link_selector, parent = category_line)

    @pytest.mark.asyncio
    # pylint: disable=protected-access
    async def test_extract_category_with_non_string_href(self, extractor:AdExtractor) -> None:
        """Integer href values 12345 and 67890 are expected to give "2345/7890"."""
        category_line = MagicMock()
        first_part = MagicMock()
        first_part.attrs = {"href": 12345}  # deliberately not a string
        second_part = MagicMock()
        second_part.attrs = {"href": 67890}  # deliberately not a string

        with patch.object(extractor, "web_find", new_callable = AsyncMock) as mock_web_find:
            # Lookups return, in order: the breadcrumb line, then its two links.
            mock_web_find.side_effect = [category_line, first_part, second_part]

            category = await extractor._extract_category_from_ad_page()

            assert category == "2345/7890"
            mock_web_find.assert_any_call(By.ID, "vap-brdcrmb")
            for link_selector in ("a:nth-of-type(2)", "a:nth-of-type(3)"):
                mock_web_find.assert_any_call(By.CSS_SELECTOR, link_selector, parent = category_line)

    @pytest.mark.asyncio
    # pylint: disable=protected-access
    async def test_extract_special_attributes_empty(self, extractor:AdExtractor) -> None:
        """An empty dimension108 string yields an empty attribute dict."""
        with patch.object(extractor, "web_execute", new_callable = AsyncMock) as mock_web_execute:
            mock_web_execute.return_value = {
                "universalAnalyticsOpts": {
                    "dimensions": {
                        "dimension108": ""
                    }
                }
            }

            attributes = await extractor._extract_special_attributes_from_ad_page(mock_web_execute.return_value)

            assert attributes == {}

    @pytest.mark.asyncio
    # pylint: disable=protected-access
    async def test_extract_special_attributes_not_empty(self, extractor:AdExtractor) -> None:
        """Five of the six encoded attributes are expected; versand_s must be absent."""
        special_atts = {
            "universalAnalyticsOpts": {
                "dimensions": {
                    "dimension108": "versand_s:t|color_s:creme|groesse_s:68|condition_s:alright|type_s:accessoires|art_s:maedchen"
                }
            }
        }
        expected_attributes = {
            "color_s": "creme",
            "groesse_s": "68",
            "condition_s": "alright",
            "type_s": "accessoires",
            "art_s": "maedchen",
        }

        attributes = await extractor._extract_special_attributes_from_ad_page(special_atts)

        assert len(attributes) == 5
        assert "versand_s" not in attributes
        for attribute_name, attribute_value in expected_attributes.items():
            assert attribute_name in attributes
            assert attributes[attribute_name] == attribute_value
class TestAdExtractorContact:
    """Checks extraction of the seller's contact details from an ad page."""

    @pytest.fixture
    def extractor(self, test_bot_config:Config) -> AdExtractor:
        """Provide an AdExtractor with a mocked browser and a description prefix/suffix set."""
        affix_overrides = {
            "ad_defaults": {
                "description": {
                    "prefix": "Test Prefix",
                    "suffix": "Test Suffix"
                }
            }
        }
        return AdExtractor(MagicMock(spec = Browser), test_bot_config.with_values(affix_overrides))

    @pytest.mark.asyncio
    # pylint: disable=protected-access
    async def test_extract_contact_info(self, extractor:AdExtractor) -> None:
        """Address and name are parsed from the page texts; a phone lookup timeout gives phone None."""
        with (
            patch.object(extractor, "page", MagicMock()),
            patch.object(extractor, "web_text", new_callable = AsyncMock) as mock_web_text,
            patch.object(extractor, "web_find", new_callable = AsyncMock) as mock_web_find,
        ):
            mock_web_text.side_effect = [
                "12345 Berlin - Mitte",
                "Example Street 123,",
                "Test User",
            ]
            mock_web_find.side_effect = [
                MagicMock(),  # contact person element
                MagicMock(),  # name element
                TimeoutError(),  # phone element lookup times out
            ]

            contact = await extractor._extract_contact_from_ad_page()

            assert contact.street == "Example Street 123"
            assert contact.zipcode == "12345"
            assert contact.location == "Berlin - Mitte"
            assert contact.name == "Test User"
            assert contact.phone is None

    @pytest.mark.asyncio
    # pylint: disable=protected-access
    async def test_extract_contact_info_timeout(self, extractor:AdExtractor) -> None:
        """When every text read and element lookup times out, the TimeoutError propagates."""
        with (
            patch.object(extractor, "page", MagicMock()),
            patch.object(extractor, "web_text", new_callable = AsyncMock, side_effect = TimeoutError()),
            patch.object(extractor, "web_find", new_callable = AsyncMock, side_effect = TimeoutError()),
            pytest.raises(TimeoutError),
        ):
            await extractor._extract_contact_from_ad_page()

    @pytest.mark.asyncio
    # pylint: disable=protected-access
    async def test_extract_contact_info_with_phone(self, extractor:AdExtractor) -> None:
        """The phone text "+49(0)1234 567890" is expected to come back as "01234567890"."""
        with (
            patch.object(extractor, "page", MagicMock()),
            patch.object(extractor, "web_text", new_callable = AsyncMock) as mock_web_text,
            patch.object(extractor, "web_find", new_callable = AsyncMock) as mock_web_find,
        ):
            mock_web_text.side_effect = [
                "12345 Berlin - Mitte",
                "Example Street 123,",
                "Test User",
                "+49(0)1234 567890"
            ]
            phone_element = MagicMock()
            mock_web_find.side_effect = [
                MagicMock(),  # contact person element
                MagicMock(),  # name element
                phone_element,  # phone element
            ]

            contact = await extractor._extract_contact_from_ad_page()

            assert contact.phone == "01234567890"
class TestAdExtractorDownload:
"""Tests for download functionality."""
@pytest.fixture
def extractor(self, test_bot_config:Config) -> AdExtractor:
    """Provide an AdExtractor with a mocked browser and a description prefix/suffix set."""
    affix_overrides = {
        "ad_defaults": {
            "description": {
                "prefix": "Test Prefix",
                "suffix": "Test Suffix"
            }
        }
    }
    return AdExtractor(MagicMock(spec = Browser), test_bot_config.with_values(affix_overrides))
@pytest.mark.asyncio
async def test_download_ad_existing_directory(self, extractor:AdExtractor) -> None:
    """Downloading when base and final directory exist creates nothing and saves the ad YAML."""
    with (
        patch("os.path.exists") as mock_exists,
        patch("os.path.isdir") as mock_isdir,
        patch("os.makedirs") as mock_makedirs,
        patch("os.mkdir") as mock_mkdir,
        patch("os.rename") as mock_rename,
        patch("shutil.rmtree") as mock_rmtree,
        patch("kleinanzeigen_bot.extract.dicts.save_dict", autospec = True) as mock_save_dict,
        patch.object(extractor, "_extract_ad_page_info_with_directory_handling", new_callable = AsyncMock) as mock_extract_with_dir,
    ):
        base_dir = "downloaded-ads"
        final_dir = os.path.join(base_dir, "ad_12345_Test Advertisement Title")
        yaml_path = os.path.join(final_dir, "ad_12345.yaml")

        # Both the base directory and the titled final directory are reported as existing;
        # only the base directory is reported as a directory.
        existing_paths = {base_dir, final_dir}
        mock_exists.side_effect = lambda path: path in existing_paths
        mock_isdir.side_effect = lambda path: path == base_dir

        # The extraction step is mocked to hand back a ready ad model plus its directory.
        ad_model = AdPartial.model_validate({
            "title": "Test Advertisement Title",
            "description": "Test Description",
            "category": "Dienstleistungen",
            "price": 100,
            "images": [],
            "contact": {
                "name": "Test User",
                "street": "Test Street 123",
                "zipcode": "12345",
                "location": "Test City"
            }
        })
        mock_extract_with_dir.return_value = (ad_model, final_dir)

        await extractor.download_ad(12345)

        mock_extract_with_dir.assert_called_once()

        # download_ad itself is expected to perform no directory operations in this case.
        mock_rmtree.assert_not_called()
        mock_mkdir.assert_not_called()
        mock_makedirs.assert_not_called()
        mock_rename.assert_not_called()

        # save_dict must have received the YAML path and the dumped model.
        save_call = mock_save_dict.call_args
        assert save_call is not None
        saved_path = save_call.args[0].replace("/", os.path.sep)
        assert saved_path == yaml_path
        assert save_call.args[1] == mock_extract_with_dir.return_value[0].model_dump()
@pytest.mark.asyncio
async def test_download_ad(self, extractor:AdExtractor) -> None:
    """Download an ad when no directory exists on the (mocked) file system.

    The directory-aware extraction helper is replaced by an ``AsyncMock``, so
    only the behaviour of ``download_ad`` itself is checked: the base directory
    is created with ``os.mkdir`` and ``save_dict`` receives the YAML path inside
    the directory reported by the helper together with the dumped ad model.
    """
    with patch("os.path.exists") as exists_mock, \
            patch("os.path.isdir") as isdir_mock, \
            patch("os.makedirs") as makedirs_mock, \
            patch("os.mkdir") as mkdir_mock, \
            patch("os.rename") as rename_mock, \
            patch("shutil.rmtree") as rmtree_mock, \
            patch("kleinanzeigen_bot.extract.dicts.save_dict", autospec = True) as save_dict_mock, \
            patch.object(extractor, "_extract_ad_page_info_with_directory_handling", new_callable = AsyncMock) as extract_mock:

        download_root = "downloaded-ads"
        ad_dir = os.path.join(download_root, "ad_12345_Test Advertisement Title")
        expected_yaml = os.path.join(ad_dir, "ad_12345.yaml")

        # Every existence / directory probe reports "missing"
        exists_mock.return_value = False
        isdir_mock.return_value = False

        # Canned helper result: the parsed ad and the directory it belongs to
        ad_stub = AdPartial.model_validate({
            "title": "Test Advertisement Title",
            "description": "Test Description",
            "category": "Dienstleistungen",
            "price": 100,
            "images": [],
            "contact": {
                "name": "Test User",
                "street": "Test Street 123",
                "zipcode": "12345",
                "location": "Test City"
            }
        })
        extract_mock.return_value = (ad_stub, ad_dir)

        await extractor.download_ad(12345)

        # The helper is invoked exactly once
        extract_mock.assert_called_once()

        # download_ad only creates the base directory; it does not remove,
        # rename or recursively create anything
        rmtree_mock.assert_not_called()
        mkdir_mock.assert_has_calls([call(download_root)])
        makedirs_mock.assert_not_called()
        rename_mock.assert_not_called()

        # Inspect the positional arguments of the most recent save_dict call
        save_call = save_dict_mock.call_args
        assert save_call is not None
        save_args = save_call.args
        # Normalise separators so the comparison is platform independent
        assert save_args[0].replace("/", os.path.sep) == expected_yaml
        assert save_args[1] == ad_stub.model_dump()
@pytest.mark.asyncio
async def test_download_ad_use_existing_folder(self, extractor:AdExtractor) -> None:
    """Download an ad when a legacy folder without the title suffix exists (default behaviour).

    The mocked extraction helper reports the existing ``ad_12345`` folder, and
    the test checks that ``download_ad`` hands a YAML path inside that folder to
    ``save_dict`` without creating, removing or renaming any directory.
    """
    with patch("os.path.exists") as exists_mock, \
            patch("os.path.isdir") as isdir_mock, \
            patch("os.makedirs") as makedirs_mock, \
            patch("os.mkdir") as mkdir_mock, \
            patch("os.rename") as rename_mock, \
            patch("shutil.rmtree") as rmtree_mock, \
            patch("kleinanzeigen_bot.extract.dicts.save_dict", autospec = True) as save_dict_mock, \
            patch.object(extractor, "_extract_ad_page_info_with_directory_handling", new_callable = AsyncMock) as extract_mock:

        download_root = "downloaded-ads"
        legacy_dir = os.path.join(download_root, "ad_12345")
        expected_yaml = os.path.join(legacy_dir, "ad_12345.yaml")

        # Both the base directory and the legacy ad directory are reported as
        # existing; only the base directory is reported as a directory
        present = {download_root, legacy_dir}
        exists_mock.side_effect = lambda path: path in present
        isdir_mock.side_effect = lambda path: path == download_root

        # Canned helper result: the parsed ad and the already existing legacy folder
        ad_stub = AdPartial.model_validate({
            "title": "Test Advertisement Title",
            "description": "Test Description",
            "category": "Dienstleistungen",
            "price": 100,
            "images": [],
            "contact": {
                "name": "Test User",
                "street": "Test Street 123",
                "zipcode": "12345",
                "location": "Test City"
            }
        })
        extract_mock.return_value = (ad_stub, legacy_dir)

        await extractor.download_ad(12345)

        # The helper is invoked exactly once
        extract_mock.assert_called_once()

        # No file-system mutation is expected from download_ad in this scenario
        rmtree_mock.assert_not_called()
        mkdir_mock.assert_not_called()
        makedirs_mock.assert_not_called()
        rename_mock.assert_not_called()

        # Inspect the positional arguments of the most recent save_dict call
        save_call = save_dict_mock.call_args
        assert save_call is not None
        save_args = save_call.args
        # Normalise separators so the comparison is platform independent
        assert save_args[0].replace("/", os.path.sep) == expected_yaml
        assert save_args[1] == ad_stub.model_dump()
@pytest.mark.asyncio
async def test_download_ad_rename_existing_folder_when_enabled(self, extractor:AdExtractor) -> None:
    """Download an ad when a legacy folder exists and ``rename_existing_folders`` is switched on.

    The extraction helper is mocked and reports the title-suffixed directory,
    so any renaming would happen inside that helper. The test checks that
    ``download_ad`` itself performs no directory operation and passes a YAML
    path inside the reported directory to ``save_dict``.
    """
    # Turn on renaming of legacy folders for this test
    extractor.config.download.rename_existing_folders = True

    with patch("os.path.exists") as exists_mock, \
            patch("os.path.isdir") as isdir_mock, \
            patch("os.makedirs") as makedirs_mock, \
            patch("os.mkdir") as mkdir_mock, \
            patch("os.rename") as rename_mock, \
            patch("shutil.rmtree") as rmtree_mock, \
            patch("kleinanzeigen_bot.extract.dicts.save_dict", autospec = True) as save_dict_mock, \
            patch.object(extractor, "_extract_ad_page_info_with_directory_handling", new_callable = AsyncMock) as extract_mock:

        download_root = "downloaded-ads"
        legacy_dir = os.path.join(download_root, "ad_12345")
        titled_dir = os.path.join(download_root, "ad_12345_Test Advertisement Title")
        expected_yaml = os.path.join(titled_dir, "ad_12345.yaml")

        # The base directory and the legacy folder are reported as existing,
        # the title-suffixed folder is not; only the base directory is
        # reported as a directory
        present = {download_root, legacy_dir}
        exists_mock.side_effect = lambda path: path in present
        isdir_mock.side_effect = lambda path: path == download_root

        # Canned helper result: the parsed ad and the title-suffixed directory
        ad_stub = AdPartial.model_validate({
            "title": "Test Advertisement Title",
            "description": "Test Description",
            "category": "Dienstleistungen",
            "price": 100,
            "images": [],
            "contact": {
                "name": "Test User",
                "street": "Test Street 123",
                "zipcode": "12345",
                "location": "Test City"
            }
        })
        extract_mock.return_value = (ad_stub, titled_dir)

        await extractor.download_ad(12345)

        # The helper is invoked exactly once
        extract_mock.assert_called_once()

        # download_ad must not touch directories itself; the helper is mocked,
        # so none of the patched file-system functions should be called
        rmtree_mock.assert_not_called()
        mkdir_mock.assert_not_called()
        makedirs_mock.assert_not_called()
        rename_mock.assert_not_called()

        # Inspect the positional arguments of the most recent save_dict call
        save_call = save_dict_mock.call_args
        assert save_call is not None
        save_args = save_call.args
        # Normalise separators so the comparison is platform independent
        assert save_args[0].replace("/", os.path.sep) == expected_yaml
        assert save_args[1] == ad_stub.model_dump()
@pytest.mark.asyncio
# pylint: disable=protected-access
async def test_download_images_no_images(self, extractor:AdExtractor) -> None:
    """Image download yields no paths when ``web_find`` raises ``TimeoutError``."""
    # web_find is replaced by an AsyncMock that raises TimeoutError on every call
    failing_find = patch.object(extractor, "web_find", new_callable = AsyncMock, side_effect = TimeoutError)
    with failing_find:
        downloaded = await extractor._download_images_from_ad_page("/some/dir", 12345)
        assert len(downloaded) == 0