feat: add configurable timeouts (#673)

## ℹ️ Description
- Related issues: #671, #658
- Introduces configurable timeout controls plus retry/backoff handling
for flaky DOM operations.

We often see timeouts which are not reproducible in certain
configurations. I suspect timeout issues based on a combination of
internet speed, browser, OS, age of the computer, and the weather.

This PR introduces a comprehensive config model to tweak timeouts.

## 📋 Changes Summary
- add TimeoutConfig to the main config/schema and expose timeouts in
README/docs
- wire WebScrapingMixin, extractor, update checker, and browser
diagnostics to honor the configurable timeouts and retries
- update translations/tests to cover the new behaviour and ensure
lint/mypy/pyright pipelines remain green

### ⚙️ Type of Change
- [ ] 🐞 Bug fix (non-breaking change which fixes an issue)
- [x] New feature (adds new functionality without breaking existing
usage)
- [ ] 💥 Breaking change (changes that might break existing user setups,
scripts, or configurations)

## Checklist
- [x] I have reviewed my changes to ensure they meet the project's
standards.
- [x] I have tested my changes and ensured that all tests pass (`pdm run
test`).
- [x] I have formatted the code (`pdm run format`).
- [x] I have verified that linting passes (`pdm run lint`).
- [x] I have updated documentation where necessary.


<!-- This is an auto-generated comment: release notes by coderabbit.ai
-->
## Summary by CodeRabbit

* **New Features**
* Centralized, configurable timeout system for web interactions,
detection flows, publishing, and pagination.
* Optional retry with exponential backoff for operations that time out.

* **Improvements**
* Replaced fixed wait times with dynamic timeouts throughout workflows.
  * More informative timeout-related messages and diagnostics.

* **Tests**
* New and expanded test coverage for timeout behavior, pagination,
diagnostics, and retry logic.
<!-- end of auto-generated comment: release notes by coderabbit.ai -->
This commit is contained in:
Jens
2025-11-13 15:08:52 +01:00
committed by GitHub
parent ac678ed888
commit a3ac27c441
16 changed files with 972 additions and 121 deletions

View File

@@ -8,12 +8,14 @@ All rights reserved.
"""
import json
import logging
import os
import platform
import shutil
import zipfile
from collections.abc import Awaitable, Callable
from pathlib import Path
from typing import NoReturn, Protocol, cast
from typing import Any, NoReturn, Protocol, cast
from unittest.mock import AsyncMock, MagicMock, Mock, mock_open, patch
import nodriver
@@ -22,6 +24,7 @@ import pytest
from nodriver.core.element import Element
from nodriver.core.tab import Tab as Page
from kleinanzeigen_bot.model.config_model import Config
from kleinanzeigen_bot.utils import loggers
from kleinanzeigen_bot.utils.web_scraping_mixin import By, Is, WebScrapingMixin, _is_admin # noqa: PLC2701
@@ -32,7 +35,13 @@ class ConfigProtocol(Protocol):
browser_args:list[str]
user_data_dir:str | None
def add_extension(self, ext:str) -> None: ...
def add_extension(self, ext:str) -> None:
...
def _nodriver_start_mock() -> Mock:
"""Return the nodriver.start mock with proper typing."""
return cast(Mock, cast(Any, nodriver).start)
class TrulyAwaitableMockPage:
@@ -82,6 +91,7 @@ def web_scraper(mock_browser:AsyncMock, mock_page:TrulyAwaitableMockPage) -> Web
scraper = WebScrapingMixin()
scraper.browser = mock_browser
scraper.page = mock_page # type: ignore[unused-ignore,reportAttributeAccessIssue]
scraper.config = Config.model_validate({"login": {"username": "user@example.com", "password": "secret"}}) # noqa: S105
return scraper
@@ -156,6 +166,21 @@ class TestWebScrapingErrorHandling:
with pytest.raises(Exception, match = "Cannot clear input"):
await web_scraper.web_input(By.ID, "test-id", "test text")
@pytest.mark.asyncio
async def test_web_input_success_returns_element(self, web_scraper:WebScrapingMixin, mock_page:TrulyAwaitableMockPage) -> None:
"""Successful web_input should send keys, wait, and return the element."""
mock_element = AsyncMock(spec = Element)
mock_page.query_selector.return_value = mock_element
mock_sleep = AsyncMock()
cast(Any, web_scraper).web_sleep = mock_sleep
result = await web_scraper.web_input(By.ID, "username", "hello world", timeout = 1)
assert result is mock_element
mock_element.clear_input.assert_awaited_once()
mock_element.send_keys.assert_awaited_once_with("hello world")
mock_sleep.assert_awaited_once()
@pytest.mark.asyncio
async def test_web_open_timeout(self, web_scraper:WebScrapingMixin, mock_browser:AsyncMock) -> None:
"""Test page load timeout in web_open."""
@@ -173,6 +198,19 @@ class TestWebScrapingErrorHandling:
with pytest.raises(TimeoutError, match = "Page did not finish loading within"):
await web_scraper.web_open("https://example.com", timeout = 0.1)
@pytest.mark.asyncio
async def test_web_open_skip_when_url_already_loaded(self, web_scraper:WebScrapingMixin, mock_browser:AsyncMock, mock_page:TrulyAwaitableMockPage) -> None:
"""web_open should short-circuit when the requested URL is already active."""
mock_browser.get.reset_mock()
mock_page.url = "https://example.com"
mock_execute = AsyncMock()
cast(Any, web_scraper).web_execute = mock_execute
await web_scraper.web_open("https://example.com", reload_if_already_open = False)
mock_browser.get.assert_not_awaited()
mock_execute.assert_not_called()
@pytest.mark.asyncio
async def test_web_request_invalid_response(self, web_scraper:WebScrapingMixin, mock_page:TrulyAwaitableMockPage) -> None:
"""Test invalid response handling in web_request."""
@@ -216,6 +254,179 @@ class TestWebScrapingErrorHandling:
with pytest.raises(Exception, match = "Attribute error"):
await web_scraper.web_check(By.ID, "test-id", Is.DISPLAYED)
@pytest.mark.asyncio
async def test_web_find_applies_timeout_multiplier_and_backoff(self, web_scraper:WebScrapingMixin) -> None:
"""Ensure multiplier/backoff logic is honored when timeouts occur."""
assert web_scraper.config is not None
web_scraper.config.timeouts.multiplier = 2.0
web_scraper.config.timeouts.retry_enabled = True
web_scraper.config.timeouts.retry_max_attempts = 2
web_scraper.config.timeouts.retry_backoff_factor = 2.0
recorded:list[tuple[float, bool]] = []
async def fake_web_await(condition:Callable[[], object], *, timeout:float, timeout_error_message:str = "",
apply_multiplier:bool = True) -> Element:
recorded.append((timeout, apply_multiplier))
raise TimeoutError(timeout_error_message or "timeout")
cast(Any, web_scraper).web_await = fake_web_await
with pytest.raises(TimeoutError):
await web_scraper.web_find(By.ID, "test-id", timeout = 0.5)
assert recorded == [(1.0, False), (2.0, False), (4.0, False)]
class TestTimeoutAndRetryHelpers:
"""Test timeout helper utilities in WebScrapingMixin."""
def test_get_timeout_config_prefers_config_timeouts(self, web_scraper:WebScrapingMixin) -> None:
"""_get_timeout_config should return the config-provided timeout model when available."""
custom_config = Config.model_validate({
"login": {"username": "user@example.com", "password": "secret"}, # noqa: S105
"timeouts": {"default": 7.5}
})
web_scraper.config = custom_config
assert web_scraper._get_timeout_config() is custom_config.timeouts
def test_timeout_attempts_respects_retry_switch(self, web_scraper:WebScrapingMixin) -> None:
"""_timeout_attempts should collapse to a single attempt when retries are disabled."""
web_scraper.config.timeouts.retry_enabled = False
assert web_scraper._timeout_attempts() == 1
web_scraper.config.timeouts.retry_enabled = True
web_scraper.config.timeouts.retry_max_attempts = 3
assert web_scraper._timeout_attempts() == 4
@pytest.mark.asyncio
async def test_run_with_timeout_retries_retries_operation(self, web_scraper:WebScrapingMixin) -> None:
"""_run_with_timeout_retries should retry when TimeoutError is raised before succeeding."""
attempts:list[float] = []
async def flaky_operation(timeout:float) -> str:
attempts.append(timeout)
if len(attempts) == 1:
raise TimeoutError("first attempt")
return "done"
web_scraper.config.timeouts.retry_max_attempts = 1
result = await web_scraper._run_with_timeout_retries(flaky_operation, description = "retry-op")
assert result == "done"
assert len(attempts) == 2
@pytest.mark.asyncio
async def test_run_with_timeout_retries_guard_clause(self, web_scraper:WebScrapingMixin) -> None:
"""_run_with_timeout_retries should guard against zero-attempt edge cases."""
async def never_called(timeout:float) -> None:
pytest.fail("operation should not run when attempts are zero")
with patch.object(web_scraper, "_timeout_attempts", return_value = 0), \
pytest.raises(TimeoutError, match = "guarded-op failed without executing operation"):
await web_scraper._run_with_timeout_retries(never_called, description = "guarded-op")
class TestSelectorTimeoutMessages:
"""Ensure selector helpers provide informative timeout messages."""
@pytest.mark.asyncio
@pytest.mark.parametrize(
("selector_type", "selector_value", "expected_message"),
[
(By.TAG_NAME, "section", "No HTML element found of tag <section> within 2.0 seconds."),
(By.CSS_SELECTOR, ".hero", "No HTML element found using CSS selector '.hero' within 2.0 seconds."),
(By.TEXT, "Submit", "No HTML element found containing text 'Submit' within 2.0 seconds."),
(By.XPATH, "//div[@class='hero']", "No HTML element found using XPath '//div[@class='hero']' within 2.0 seconds."),
]
)
async def test_web_find_timeout_suffixes(
self,
web_scraper:WebScrapingMixin,
selector_type:By,
selector_value:str,
expected_message:str
) -> None:
"""web_find should pass descriptive timeout messages for every selector strategy."""
mock_element = AsyncMock(spec = Element)
mock_wait = AsyncMock(return_value = mock_element)
cast(Any, web_scraper).web_await = mock_wait
result = await web_scraper.web_find(selector_type, selector_value, timeout = 2)
assert result is mock_element
call = mock_wait.await_args_list[0]
assert expected_message == call.kwargs["timeout_error_message"]
assert call.kwargs["apply_multiplier"] is False
@pytest.mark.asyncio
@pytest.mark.parametrize(
("selector_type", "selector_value", "expected_message"),
[
(By.CLASS_NAME, "hero", "No HTML elements found with CSS class 'hero' within 1 seconds."),
(By.CSS_SELECTOR, ".card", "No HTML elements found using CSS selector '.card' within 1 seconds."),
(By.TAG_NAME, "article", "No HTML elements found of tag <article> within 1 seconds."),
(By.TEXT, "Listings", "No HTML elements found containing text 'Listings' within 1 seconds."),
(By.XPATH, "//footer", "No HTML elements found using XPath '//footer' within 1 seconds."),
]
)
async def test_web_find_all_once_timeout_suffixes(
self,
web_scraper:WebScrapingMixin,
selector_type:By,
selector_value:str,
expected_message:str
) -> None:
"""_web_find_all_once should surface informative timeout errors for each selector."""
elements = [AsyncMock(spec = Element)]
mock_wait = AsyncMock(return_value = elements)
cast(Any, web_scraper).web_await = mock_wait
result = await web_scraper._web_find_all_once(selector_type, selector_value, 1)
assert result is elements
call = mock_wait.await_args_list[0]
assert expected_message == call.kwargs["timeout_error_message"]
assert call.kwargs["apply_multiplier"] is False
@pytest.mark.asyncio
async def test_web_find_all_delegates_to_retry_helper(self, web_scraper:WebScrapingMixin) -> None:
"""web_find_all should execute via the timeout retry helper."""
elements = [AsyncMock(spec = Element)]
async def fake_retry(operation:Callable[[float], Awaitable[list[Element]]], **kwargs:Any) -> list[Element]:
assert kwargs["description"] == "web_find_all(CLASS_NAME, hero)"
assert kwargs["override"] == 1.5
result = await operation(0.42)
return result
retry_mock = AsyncMock(side_effect = fake_retry)
once_mock = AsyncMock(return_value = elements)
cast(Any, web_scraper)._run_with_timeout_retries = retry_mock
cast(Any, web_scraper)._web_find_all_once = once_mock
result = await web_scraper.web_find_all(By.CLASS_NAME, "hero", timeout = 1.5)
assert result is elements
retry_call = retry_mock.await_args_list[0]
assert retry_call.kwargs["key"] == "default"
assert retry_call.kwargs["override"] == 1.5
once_call = once_mock.await_args_list[0]
assert once_call.args[:2] == (By.CLASS_NAME, "hero")
assert once_call.args[2] == 0.42
@pytest.mark.asyncio
async def test_web_check_unsupported_attribute(self, web_scraper:WebScrapingMixin, mock_page:TrulyAwaitableMockPage) -> None:
"""web_check should raise for unsupported attribute queries."""
mock_element = AsyncMock(spec = Element)
mock_element.attrs = {}
mock_page.query_selector.return_value = mock_element
with pytest.raises(AssertionError, match = "Unsupported attribute"):
await web_scraper.web_check(By.ID, "test-id", cast(Is, object()), timeout = 0.1)
class TestWebScrapingSessionManagement:
"""Test session management edge cases in WebScrapingMixin."""
@@ -295,9 +506,42 @@ class TestWebScrapingSessionManagement:
with patch("psutil.Process") as mock_proc:
mock_proc.return_value.children.return_value = []
scraper.close_browser_session()
stop_mock.assert_called_once()
assert scraper.browser is None
assert scraper.page is None
stop_mock.assert_called_once()
assert scraper.browser is None
assert scraper.page is None
class TestWebScrolling:
"""Test scrolling helpers."""
@pytest.mark.asyncio
async def test_web_scroll_page_down_scrolls_and_returns(self, web_scraper:WebScrapingMixin) -> None:
"""web_scroll_page_down should scroll both directions when requested."""
scripts:list[str] = []
async def exec_side_effect(script:str) -> int | None:
scripts.append(script)
if script == "document.body.scrollHeight":
return 20
return None
cast(Any, web_scraper).web_execute = AsyncMock(side_effect = exec_side_effect)
with patch("kleinanzeigen_bot.utils.web_scraping_mixin.asyncio.sleep", new_callable = AsyncMock) as mock_sleep:
await web_scraper.web_scroll_page_down(scroll_length = 10, scroll_speed = 10, scroll_back_top = True)
assert scripts[0] == "document.body.scrollHeight"
# Expect four scrollTo operations: two down, two up
assert scripts.count("document.body.scrollHeight") == 1
scroll_calls = [script for script in scripts if script.startswith("window.scrollTo")]
assert scroll_calls == [
"window.scrollTo(0, 10)",
"window.scrollTo(0, 20)",
"window.scrollTo(0, 10)",
"window.scrollTo(0, 0)"
]
sleep_durations = [call.args[0] for call in mock_sleep.await_args_list]
assert sleep_durations == [1.0, 1.0, 0.5, 0.5]
@pytest.mark.asyncio
async def test_session_expiration_handling(self, web_scraper:WebScrapingMixin, mock_browser:AsyncMock) -> None:
@@ -468,7 +712,7 @@ class TestWebScrapingBrowserConfiguration:
def add_extension(self, ext:str) -> None:
self._extensions.append(ext) # Use private extensions list
# Mock nodriver.start to return a mock browser # type: ignore[attr-defined]
# Mock nodriver.start to return a mock browser
mock_browser = AsyncMock()
mock_browser.websocket_url = "ws://localhost:9222"
monkeypatch.setattr(nodriver, "start", AsyncMock(return_value = mock_browser))
@@ -557,7 +801,7 @@ class TestWebScrapingBrowserConfiguration:
def add_extension(self, ext:str) -> None:
self.extensions.append(ext)
# Mock nodriver.start to return a mock browser # type: ignore[attr-defined]
# Mock nodriver.start to return a mock browser
mock_browser = AsyncMock()
mock_browser.websocket_url = "ws://localhost:9222"
monkeypatch.setattr(nodriver, "start", AsyncMock(return_value = mock_browser))
@@ -576,7 +820,7 @@ class TestWebScrapingBrowserConfiguration:
await scraper.create_browser_session()
# Verify browser arguments
config = cast(Mock, nodriver.start).call_args[0][0] # type: ignore[attr-defined]
config = _nodriver_start_mock().call_args[0][0]
assert "--custom-arg=value" in config.browser_args
assert "--another-arg" in config.browser_args
assert "--incognito" in config.browser_args
@@ -589,7 +833,7 @@ class TestWebScrapingBrowserConfiguration:
await scraper.create_browser_session()
# Verify Edge-specific arguments
config = cast(Mock, nodriver.start).call_args[0][0] # type: ignore[attr-defined]
config = _nodriver_start_mock().call_args[0][0]
assert "-inprivate" in config.browser_args
assert os.environ.get("MSEDGEDRIVER_TELEMETRY_OPTOUT") == "1"
@@ -620,7 +864,7 @@ class TestWebScrapingBrowserConfiguration:
with zipfile.ZipFile(ext2, "w") as z:
z.writestr("manifest.json", '{"name": "Test Extension 2"}')
# Mock nodriver.start to return a mock browser # type: ignore[attr-defined]
# Mock nodriver.start to return a mock browser
mock_browser = AsyncMock()
mock_browser.websocket_url = "ws://localhost:9222"
monkeypatch.setattr(nodriver, "start", AsyncMock(return_value = mock_browser))
@@ -644,7 +888,7 @@ class TestWebScrapingBrowserConfiguration:
await scraper.create_browser_session()
# Verify extensions were loaded
config = cast(Mock, nodriver.start).call_args[0][0] # type: ignore[attr-defined]
config = _nodriver_start_mock().call_args[0][0]
assert len(config._extensions) == 2
for ext_path in config._extensions:
assert os.path.exists(ext_path)
@@ -713,7 +957,7 @@ class TestWebScrapingBrowserConfiguration:
def add_extension(self, ext:str) -> None:
self._extensions.append(ext)
# Mock nodriver.start to return a mock browser # type: ignore[attr-defined]
# Mock nodriver.start to return a mock browser
mock_browser = AsyncMock()
mock_browser.websocket_url = "ws://localhost:9222"
monkeypatch.setattr(nodriver, "start", AsyncMock(return_value = mock_browser))
@@ -772,7 +1016,7 @@ class TestWebScrapingBrowserConfiguration:
temp_file = tmp_path / "temp_resource"
temp_file.write_text("test")
# Mock nodriver.start to raise an exception # type: ignore[attr-defined]
# Mock nodriver.start to raise an exception
async def mock_start_fail(*args:object, **kwargs:object) -> NoReturn:
if temp_file.exists():
temp_file.unlink()
@@ -801,7 +1045,7 @@ class TestWebScrapingBrowserConfiguration:
assert scraper.browser is None
assert scraper.page is None
# Now patch nodriver.start to return a new mock browser each time # type: ignore[attr-defined]
# Now patch nodriver.start to return a new mock browser each time
mock_browser = make_mock_browser()
mock_page = TrulyAwaitableMockPage()
mock_browser.get = AsyncMock(return_value = mock_page)
@@ -1445,6 +1689,46 @@ class TestWebScrapingDiagnostics:
# Should not raise any exceptions
web_scraper.diagnose_browser_issues()
def test_diagnose_browser_issues_handles_per_process_errors(
self, scraper_with_config:WebScrapingMixin, caplog:pytest.LogCaptureFixture
) -> None:
"""diagnose_browser_issues should ignore psutil errors raised per process."""
caplog.set_level(logging.INFO)
class FailingProcess:
@property
def info(self) -> dict[str, object]:
raise psutil.AccessDenied(pid = 999)
with patch("os.path.exists", return_value = True), \
patch("os.access", return_value = True), \
patch("psutil.process_iter", return_value = [FailingProcess()]), \
patch("platform.system", return_value = "Linux"), \
patch("kleinanzeigen_bot.utils.web_scraping_mixin._is_admin", return_value = False), \
patch.object(scraper_with_config, "_diagnose_chrome_version_issues"):
scraper_with_config.browser_config.binary_location = "/usr/bin/chrome"
scraper_with_config.diagnose_browser_issues()
assert "(info) No browser processes currently running" in caplog.text
def test_diagnose_browser_issues_handles_global_psutil_failure(
self, scraper_with_config:WebScrapingMixin, caplog:pytest.LogCaptureFixture
) -> None:
"""diagnose_browser_issues should log a warning if psutil.process_iter fails entirely."""
caplog.set_level(logging.WARNING)
with patch("os.path.exists", return_value = True), \
patch("os.access", return_value = True), \
patch("psutil.process_iter", side_effect = psutil.Error("boom")), \
patch("platform.system", return_value = "Linux"), \
patch("kleinanzeigen_bot.utils.web_scraping_mixin._is_admin", return_value = False), \
patch.object(scraper_with_config, "_diagnose_chrome_version_issues"):
scraper_with_config.browser_config.binary_location = "/usr/bin/chrome"
scraper_with_config.diagnose_browser_issues()
assert "(warn) Unable to inspect browser processes:" in caplog.text
@pytest.mark.asyncio
async def test_validate_chrome_version_configuration_port_open_but_api_inaccessible(
self, web_scraper:WebScrapingMixin