feat: add configurable timeouts (#673)

## ℹ️ Description
- Related issues: #671, #658
- Introduces configurable timeout controls plus retry/backoff handling
for flaky DOM operations.

We often see timeouts which are not reproducible in certain
configurations. I suspect timeout issues based on a combination of
internet speed, browser, os, age of the computer and the weather.

This PR introduces a comprehensive config model to tweak timeouts.

## 📋 Changes Summary
- add TimeoutConfig to the main config/schema and expose timeouts in
README/docs
- wire WebScrapingMixin, extractor, update checker, and browser
diagnostics to honor the configurable timeouts and retries
- update translations/tests to cover the new behaviour and ensure
lint/mypy/pyright pipelines remain green

### ⚙️ Type of Change
- [ ] 🐞 Bug fix (non-breaking change which fixes an issue)
- [x]  New feature (adds new functionality without breaking existing
usage)
- [ ] 💥 Breaking change (changes that might break existing user setups,
scripts, or configurations)

##  Checklist
- [x] I have reviewed my changes to ensure they meet the project's
standards.
- [x] I have tested my changes and ensured that all tests pass (`pdm run
test`).
- [x] I have formatted the code (`pdm run format`).
- [x] I have verified that linting passes (`pdm run lint`).
- [x] I have updated documentation where necessary.


<!-- This is an auto-generated comment: release notes by coderabbit.ai
-->
## Summary by CodeRabbit

* **New Features**
* Centralized, configurable timeout system for web interactions,
detection flows, publishing, and pagination.
* Optional retry with exponential backoff for operations that time out.

* **Improvements**
* Replaced fixed wait times with dynamic timeouts throughout workflows.
  * More informative timeout-related messages and diagnostics.

* **Tests**
* New and expanded test coverage for timeout behavior, pagination,
diagnostics, and retry logic.
<!-- end of auto-generated comment: release notes by coderabbit.ai -->
This commit is contained in:
Jens
2025-11-13 15:08:52 +01:00
committed by GitHub
parent ac678ed888
commit a3ac27c441
16 changed files with 972 additions and 121 deletions

View File

@@ -573,8 +573,9 @@ class KleinanzeigenBot(WebScrapingMixin):
async def check_and_wait_for_captcha(self, *, is_login_page:bool = True) -> None:
try:
captcha_timeout = self._timeout("captcha_detection")
await self.web_find(By.CSS_SELECTOR,
"iframe[name^='a-'][src^='https://www.google.com/recaptcha/api2/anchor?']", timeout = 2)
"iframe[name^='a-'][src^='https://www.google.com/recaptcha/api2/anchor?']", timeout = captcha_timeout)
if not is_login_page and self.config.captcha.auto_restart:
LOG.warning("Captcha recognized - auto-restart enabled, abort run...")
@@ -624,7 +625,8 @@ class KleinanzeigenBot(WebScrapingMixin):
async def handle_after_login_logic(self) -> None:
try:
await self.web_find(By.TEXT, "Wir haben dir gerade einen 6-stelligen Code für die Telefonnummer", timeout = 4)
sms_timeout = self._timeout("sms_verification")
await self.web_find(By.TEXT, "Wir haben dir gerade einen 6-stelligen Code für die Telefonnummer", timeout = sms_timeout)
LOG.warning("############################################")
LOG.warning("# Device verification message detected. Please follow the instruction displayed in the Browser.")
LOG.warning("############################################")
@@ -634,9 +636,12 @@ class KleinanzeigenBot(WebScrapingMixin):
try:
LOG.info("Handling GDPR disclaimer...")
await self.web_find(By.ID, "gdpr-banner-accept", timeout = 10)
gdpr_timeout = self._timeout("gdpr_prompt")
await self.web_find(By.ID, "gdpr-banner-accept", timeout = gdpr_timeout)
await self.web_click(By.ID, "gdpr-banner-cmp-button")
await self.web_click(By.XPATH, "//div[@id='ConsentManagementPage']//*//button//*[contains(., 'Alle ablehnen und fortfahren')]", timeout = 10)
await self.web_click(By.XPATH,
"//div[@id='ConsentManagementPage']//*//button//*[contains(., 'Alle ablehnen und fortfahren')]",
timeout = gdpr_timeout)
except TimeoutError:
pass
@@ -724,7 +729,8 @@ class KleinanzeigenBot(WebScrapingMixin):
count += 1
await self.publish_ad(ad_file, ad_cfg, ad_cfg_orig, published_ads, AdUpdateStrategy.REPLACE)
await self.web_await(self.__check_publishing_result, timeout = 5 * 60)
publish_timeout = self._timeout("publishing_result")
await self.web_await(self.__check_publishing_result, timeout = publish_timeout)
if self.config.publishing.delete_old_ads == "AFTER_PUBLISH" and not self.keep_old_ads:
await self.delete_ad(ad_cfg, published_ads, delete_old_ads_by_title = False)
@@ -924,7 +930,8 @@ class KleinanzeigenBot(WebScrapingMixin):
# wait for payment form if commercial account is used
#############################
try:
await self.web_find(By.ID, "myftr-shppngcrt-frm", timeout = 2)
short_timeout = self._timeout("quick_dom")
await self.web_find(By.ID, "myftr-shppngcrt-frm", timeout = short_timeout)
LOG.warning("############################################")
LOG.warning("# Payment form detected! Please proceed with payment.")
@@ -934,7 +941,8 @@ class KleinanzeigenBot(WebScrapingMixin):
except TimeoutError:
pass
await self.web_await(lambda: "p-anzeige-aufgeben-bestaetigung.html?adId=" in self.page.url, timeout = 20)
confirmation_timeout = self._timeout("publishing_confirmation")
await self.web_await(lambda: "p-anzeige-aufgeben-bestaetigung.html?adId=" in self.page.url, timeout = confirmation_timeout)
# extract the ad id from the URL's query parameter
current_url_query_params = urllib_parse.parse_qs(urllib_parse.urlparse(self.page.url).query)
@@ -986,7 +994,8 @@ class KleinanzeigenBot(WebScrapingMixin):
count += 1
await self.publish_ad(ad_file, ad_cfg, ad_cfg_orig, published_ads, AdUpdateStrategy.MODIFY)
await self.web_await(self.__check_publishing_result, timeout = 5 * 60)
publish_timeout = self._timeout("publishing_result")
await self.web_await(self.__check_publishing_result, timeout = publish_timeout)
LOG.info("############################################")
LOG.info("DONE: updated %s", pluralize("ad", count))
@@ -1080,6 +1089,7 @@ class KleinanzeigenBot(WebScrapingMixin):
LOG.debug("Successfully set attribute field [%s] to [%s]...", special_attribute_key, special_attribute_value_str)
async def __set_shipping(self, ad_cfg:Ad, mode:AdUpdateStrategy = AdUpdateStrategy.REPLACE) -> None:
short_timeout = self._timeout("quick_dom")
if ad_cfg.shipping_type == "PICKUP":
try:
await self.web_click(By.ID, "radio-pickup")
@@ -1091,7 +1101,7 @@ class KleinanzeigenBot(WebScrapingMixin):
if mode == AdUpdateStrategy.MODIFY:
try:
# when "Andere Versandmethoden" is not available, go back and start over new
await self.web_find(By.XPATH, '//dialog//button[contains(., "Andere Versandmethoden")]', timeout = 2)
await self.web_find(By.XPATH, '//dialog//button[contains(., "Andere Versandmethoden")]', timeout = short_timeout)
except TimeoutError:
await self.web_click(By.XPATH, '//dialog//button[contains(., "Zurück")]')
@@ -1120,7 +1130,7 @@ class KleinanzeigenBot(WebScrapingMixin):
# (important for mode = UPDATE)
await self.web_find(By.XPATH,
'//input[contains(@placeholder, "Versandkosten (optional)")]',
timeout = 2)
timeout = short_timeout)
except TimeoutError:
await self.web_click(By.XPATH, '//*[contains(@id, "INDIVIDUAL") and contains(@data-testid, "Individueller Versand")]')

View File

@@ -33,7 +33,7 @@ class AdExtractor(WebScrapingMixin):
def __init__(self, browser:Browser, config:Config) -> None:
super().__init__()
self.browser = browser
self.config = config
self.config:Config = config
async def download_ad(self, ad_id:int) -> None:
"""
@@ -146,9 +146,10 @@ class AdExtractor(WebScrapingMixin):
# --- Pagination handling ---
multi_page = False
pagination_timeout = self._timeout("pagination_initial")
try:
# Correct selector: Use uppercase '.Pagination'
pagination_section = await self.web_find(By.CSS_SELECTOR, ".Pagination", timeout = 10) # Increased timeout slightly
pagination_section = await self.web_find(By.CSS_SELECTOR, ".Pagination", timeout = pagination_timeout) # Increased timeout slightly
# Correct selector: Use 'aria-label'
# Also check if the button is actually present AND potentially enabled (though enabled check isn't strictly necessary here, only for clicking later)
next_buttons = await self.web_find_all(By.CSS_SELECTOR, 'button[aria-label="Nächste"]', parent = pagination_section)
@@ -204,9 +205,10 @@ class AdExtractor(WebScrapingMixin):
break
# --- Navigate to next page ---
follow_up_timeout = self._timeout("pagination_follow_up")
try:
# Find the pagination section again (scope might have changed after scroll/wait)
pagination_section = await self.web_find(By.CSS_SELECTOR, ".Pagination", timeout = 5)
pagination_section = await self.web_find(By.CSS_SELECTOR, ".Pagination", timeout = follow_up_timeout)
# Find the "Next" button using the correct aria-label selector and ensure it's not disabled
next_button_element = None
possible_next_buttons = await self.web_find_all(By.CSS_SELECTOR, 'button[aria-label="Nächste"]', parent = pagination_section)
@@ -432,8 +434,19 @@ class AdExtractor(WebScrapingMixin):
# Fallback to legacy selectors in case the breadcrumb structure is unexpected.
LOG.debug(_("Falling back to legacy breadcrumb selectors; collected ids: %s"), category_ids)
category_first_part = await self.web_find(By.CSS_SELECTOR, "a:nth-of-type(2)", parent = category_line)
category_second_part = await self.web_find(By.CSS_SELECTOR, "a:nth-of-type(3)", parent = category_line)
fallback_timeout = self._effective_timeout()
try:
category_first_part = await self.web_find(By.CSS_SELECTOR, "a:nth-of-type(2)", parent = category_line)
category_second_part = await self.web_find(By.CSS_SELECTOR, "a:nth-of-type(3)", parent = category_line)
except TimeoutError as exc:
LOG.error(
"Legacy breadcrumb selectors not found within %.1f seconds (collected ids: %s)",
fallback_timeout,
category_ids
)
raise TimeoutError(
_("Unable to locate breadcrumb fallback selectors within %(seconds).1f seconds.") % {"seconds": fallback_timeout}
) from exc
href_first:str = str(category_first_part.attrs["href"])
href_second:str = str(category_second_part.attrs["href"])
cat_num_first_raw = href_first.rsplit("/", maxsplit = 1)[-1]

View File

@@ -114,6 +114,55 @@ class CaptchaConfig(ContextualModel):
restart_delay:str = "6h"
class TimeoutConfig(ContextualModel):
    """
    Centralized timeout configuration (all values in seconds).

    Named fields provide base timeouts for specific operations; ``multiplier``
    scales every resolved value globally, and the ``retry_*`` fields control
    the built-in retry/backoff behaviour for DOM operations.
    """
    multiplier:float = Field(
        default = 1.0,
        ge = 0.1,
        description = "Global multiplier applied to all timeout values."
    )
    default:float = Field(default = 5.0, ge = 0.0, description = "Baseline timeout for DOM interactions.")
    page_load:float = Field(default = 15.0, ge = 1.0, description = "Page load timeout for web_open.")
    captcha_detection:float = Field(default = 2.0, ge = 0.1, description = "Timeout for captcha iframe detection.")
    sms_verification:float = Field(default = 4.0, ge = 0.1, description = "Timeout for SMS verification prompts.")
    gdpr_prompt:float = Field(default = 10.0, ge = 1.0, description = "Timeout for GDPR/consent dialogs.")
    publishing_result:float = Field(default = 300.0, ge = 10.0, description = "Timeout for publishing result checks.")
    publishing_confirmation:float = Field(default = 20.0, ge = 1.0, description = "Timeout for publish confirmation redirect.")
    pagination_initial:float = Field(default = 10.0, ge = 1.0, description = "Timeout for initial pagination lookup.")
    pagination_follow_up:float = Field(default = 5.0, ge = 1.0, description = "Timeout for subsequent pagination navigation.")
    quick_dom:float = Field(default = 2.0, ge = 0.1, description = "Generic short timeout for transient UI.")
    update_check:float = Field(default = 10.0, ge = 1.0, description = "Timeout for GitHub update checks.")
    chrome_remote_probe:float = Field(default = 2.0, ge = 0.1, description = "Timeout for local remote-debugging probes.")
    chrome_remote_debugging:float = Field(default = 5.0, ge = 1.0, description = "Timeout for remote debugging API calls.")
    chrome_binary_detection:float = Field(default = 10.0, ge = 1.0, description = "Timeout for chrome --version subprocesses.")
    retry_enabled:bool = Field(default = True, description = "Enable built-in retry/backoff for DOM operations.")
    retry_max_attempts:int = Field(default = 2, ge = 1, description = "Max retry attempts when retry is enabled.")
    retry_backoff_factor:float = Field(default = 1.5, ge = 1.0, description = "Exponential factor applied per retry attempt.")

    def resolve(self, key:str = "default", override:float | None = None) -> float:
        """
        Return the base timeout (seconds) for the given key without applying modifiers.

        :param key: name of a timeout field; unknown or non-timeout keys fall back to ``default``
        :param override: explicit value that, when given, wins over any configured field
        """
        if override is not None:
            return float(override)
        if key == "default":
            return float(self.default)
        attr = getattr(self, key, None)
        # bool is a subclass of int, so without the explicit exclusion a key like
        # "retry_enabled" would silently resolve to 1.0/0.0 instead of the baseline.
        if isinstance(attr, (int, float)) and not isinstance(attr, bool):
            return float(attr)
        return float(self.default)

    def effective(self, key:str = "default", override:float | None = None, *, attempt:int = 0) -> float:
        """
        Return the effective timeout (seconds) with multiplier/backoff applied.

        :param attempt: zero-based retry attempt; attempts > 0 grow the timeout exponentially
        """
        base = self.resolve(key, override)
        backoff = self.retry_backoff_factor ** attempt if attempt > 0 else 1.0
        return base * self.multiplier * backoff
def _validate_glob_pattern(v:str) -> str:
if not v.strip():
raise ValueError("must be a non-empty, non-blank glob pattern")
@@ -154,6 +203,7 @@ Example:
login:LoginConfig = Field(default_factory = LoginConfig.model_construct, description = "Login credentials")
captcha:CaptchaConfig = Field(default_factory = CaptchaConfig)
update_check:UpdateCheckConfig = Field(default_factory = UpdateCheckConfig, description = "Update check configuration")
timeouts:TimeoutConfig = Field(default_factory = TimeoutConfig, description = "Centralized timeout configuration.")
def with_values(self, values:dict[str, Any]) -> Config:
return Config.model_validate(

View File

@@ -219,6 +219,8 @@ kleinanzeigen_bot/extract.py:
_extract_category_from_ad_page:
"Breadcrumb container 'vap-brdcrmb' not found; cannot extract ad category: %s": "Breadcrumb-Container 'vap-brdcrmb' nicht gefunden; kann Anzeigenkategorie nicht extrahieren: %s"
"Falling back to legacy breadcrumb selectors; collected ids: %s": "Weiche auf ältere Breadcrumb-Selektoren aus; gesammelte IDs: %s"
"Legacy breadcrumb selectors not found within %.1f seconds (collected ids: %s)": "Ältere Breadcrumb-Selektoren nicht innerhalb von %.1f Sekunden gefunden (gesammelte IDs: %s)"
"Unable to locate breadcrumb fallback selectors within %(seconds).1f seconds.": "Ältere Breadcrumb-Selektoren konnten nicht innerhalb von %(seconds).1f Sekunden gefunden werden."
#################################################
kleinanzeigen_bot/utils/i18n.py:
@@ -398,11 +400,6 @@ kleinanzeigen_bot/utils/web_scraping_mixin.py:
web_check:
"Unsupported attribute: %s": "Nicht unterstütztes Attribut: %s"
web_find:
"Unsupported selector type: %s": "Nicht unterstützter Selektor-Typ: %s"
web_find_all:
"Unsupported selector type: %s": "Nicht unterstützter Selektor-Typ: %s"
close_browser_session:
"Closing Browser session...": "Schließe Browser-Sitzung..."
@@ -417,6 +414,12 @@ kleinanzeigen_bot/utils/web_scraping_mixin.py:
web_request:
" -> HTTP %s [%s]...": " -> HTTP %s [%s]..."
_web_find_once:
"Unsupported selector type: %s": "Nicht unterstützter Selektor-Typ: %s"
_web_find_all_once:
"Unsupported selector type: %s": "Nicht unterstützter Selektor-Typ: %s"
diagnose_browser_issues:
"=== Browser Connection Diagnostics ===": "=== Browser-Verbindungsdiagnose ==="
"=== End Diagnostics ===": "=== Ende der Diagnose ==="
@@ -434,6 +437,8 @@ kleinanzeigen_bot/utils/web_scraping_mixin.py:
"(info) Remote debugging port configured: %d": "(Info) Remote-Debugging-Port konfiguriert: %d"
"(info) Remote debugging port is not open": "(Info) Remote-Debugging-Port ist nicht offen"
"(warn) Unable to inspect browser processes: %s": "(Warnung) Browser-Prozesse konnten nicht überprüft werden: %s"
"(info) No browser processes currently running": "(Info) Derzeit keine Browser-Prozesse aktiv"
"(fail) Running as root - this can cause browser issues": "(Fehler) Läuft als Root - dies kann Browser-Probleme verursachen"

View File

@@ -49,6 +49,10 @@ class UpdateChecker:
"""
return __version__
def _request_timeout(self) -> float:
    """Return the effective timeout for HTTP calls."""
    # Delegates to the centralized TimeoutConfig so the "update_check" value
    # honors the globally configured multiplier.
    return self.config.timeouts.effective("update_check")
def _get_commit_hash(self, version:str) -> str | None:
"""Extract the commit hash from a version string.
@@ -74,7 +78,7 @@ class UpdateChecker:
try:
response = requests.get(
f"https://api.github.com/repos/Second-Hand-Friends/kleinanzeigen-bot/releases/tags/{tag_name}",
timeout = 10
timeout = self._request_timeout()
)
response.raise_for_status()
data = response.json()
@@ -97,7 +101,7 @@ class UpdateChecker:
try:
response = requests.get(
f"https://api.github.com/repos/Second-Hand-Friends/kleinanzeigen-bot/commits/{commit}",
timeout = 10
timeout = self._request_timeout()
)
response.raise_for_status()
data = response.json()
@@ -148,7 +152,7 @@ class UpdateChecker:
# Use /releases/latest endpoint for stable releases
response = requests.get(
"https://api.github.com/repos/Second-Hand-Friends/kleinanzeigen-bot/releases/latest",
timeout = 10
timeout = self._request_timeout()
)
response.raise_for_status()
release = response.json()
@@ -160,7 +164,7 @@ class UpdateChecker:
# Use /releases endpoint and select the most recent prerelease
response = requests.get(
"https://api.github.com/repos/Second-Hand-Friends/kleinanzeigen-bot/releases",
timeout = 10
timeout = self._request_timeout()
)
response.raise_for_status()
releases = response.json()

View File

@@ -78,23 +78,25 @@ def _normalize_browser_name(browser_name:str) -> str:
return "Chrome"
def detect_chrome_version_from_binary(binary_path:str) -> ChromeVersionInfo | None:
def detect_chrome_version_from_binary(binary_path:str, *, timeout:float | None = None) -> ChromeVersionInfo | None:
"""
Detect Chrome version by running the browser binary.
Args:
binary_path: Path to the Chrome binary
timeout: Optional timeout (seconds) for the subprocess call
Returns:
ChromeVersionInfo if successful, None if detection fails
"""
effective_timeout = timeout if timeout is not None else 10.0
try:
# Run browser with --version flag
result = subprocess.run( # noqa: S603
[binary_path, "--version"],
check = False, capture_output = True,
text = True,
timeout = 10
timeout = effective_timeout
)
if result.returncode != 0:
@@ -114,28 +116,30 @@ def detect_chrome_version_from_binary(binary_path:str) -> ChromeVersionInfo | No
return ChromeVersionInfo(version_string, major_version, browser_name)
except subprocess.TimeoutExpired:
LOG.debug("Browser version command timed out")
LOG.debug("Browser version command timed out after %.1fs", effective_timeout)
return None
except (subprocess.SubprocessError, ValueError) as e:
LOG.debug("Failed to detect browser version: %s", str(e))
return None
def detect_chrome_version_from_remote_debugging(host:str = "127.0.0.1", port:int = 9222) -> ChromeVersionInfo | None:
def detect_chrome_version_from_remote_debugging(host:str = "127.0.0.1", port:int = 9222, *, timeout:float | None = None) -> ChromeVersionInfo | None:
"""
Detect Chrome version from remote debugging API.
Args:
host: Remote debugging host
port: Remote debugging port
timeout: Optional timeout (seconds) for the HTTP request
Returns:
ChromeVersionInfo if successful, None if detection fails
"""
effective_timeout = timeout if timeout is not None else 5.0
try:
# Query the remote debugging API
url = f"http://{host}:{port}/json/version"
response = urllib.request.urlopen(url, timeout = 5) # noqa: S310
response = urllib.request.urlopen(url, timeout = effective_timeout) # noqa: S310
version_data = json.loads(response.read().decode())
# Extract version information
@@ -200,7 +204,10 @@ def validate_chrome_136_configuration(browser_arguments:list[str], user_data_dir
def get_chrome_version_diagnostic_info(
binary_path:str | None = None,
remote_host:str = "127.0.0.1",
remote_port:int | None = None
remote_port:int | None = None,
*,
remote_timeout:float | None = None,
binary_timeout:float | None = None
) -> dict[str, Any]:
"""
Get comprehensive Chrome version diagnostic information.
@@ -209,6 +216,8 @@ def get_chrome_version_diagnostic_info(
binary_path: Path to Chrome binary (optional)
remote_host: Remote debugging host
remote_port: Remote debugging port (optional)
remote_timeout: Timeout for remote debugging detection
binary_timeout: Timeout for binary detection
Returns:
Dictionary with diagnostic information
@@ -223,7 +232,7 @@ def get_chrome_version_diagnostic_info(
# Try binary detection
if binary_path:
version_info = detect_chrome_version_from_binary(binary_path)
version_info = detect_chrome_version_from_binary(binary_path, timeout = binary_timeout)
if version_info:
diagnostic_info["binary_detection"] = {
"version_string": version_info.version_string,
@@ -235,7 +244,7 @@ def get_chrome_version_diagnostic_info(
# Try remote debugging detection
if remote_port:
version_info = detect_chrome_version_from_remote_debugging(remote_host, remote_port)
version_info = detect_chrome_version_from_remote_debugging(remote_host, remote_port, timeout = remote_timeout)
if version_info:
diagnostic_info["remote_detection"] = {
"version_string": version_info.version_string,

View File

@@ -2,9 +2,9 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# SPDX-ArtifactOfProjectHomePage: https://github.com/Second-Hand-Friends/kleinanzeigen-bot/
import asyncio, enum, inspect, json, os, platform, secrets, shutil, subprocess, urllib.request # isort: skip # noqa: S404
from collections.abc import Callable, Coroutine, Iterable
from collections.abc import Awaitable, Callable, Coroutine, Iterable
from gettext import gettext as _
from typing import Any, Final, cast
from typing import Any, Final, Optional, cast
try:
from typing import Never # type: ignore[attr-defined,unused-ignore] # mypy
@@ -15,10 +15,13 @@ import nodriver, psutil # isort: skip
from typing import TYPE_CHECKING, TypeGuard
from nodriver.core.browser import Browser
from nodriver.core.config import Config
from nodriver.core.config import Config as NodriverConfig
from nodriver.core.element import Element
from nodriver.core.tab import Tab as Page
from kleinanzeigen_bot.model.config_model import Config as BotConfig
from kleinanzeigen_bot.model.config_model import TimeoutConfig
from . import loggers, net
from .chrome_version_detector import (
ChromeVersionInfo,
@@ -32,6 +35,7 @@ from .misc import T, ensure
if TYPE_CHECKING:
from nodriver.cdp.runtime import RemoteObject
# Constants for RemoteObject conversion
_KEY_VALUE_PAIR_SIZE = 2
@@ -102,6 +106,69 @@ class WebScrapingMixin:
self.browser_config:Final[BrowserConfig] = BrowserConfig()
self.browser:Browser = None # pyright: ignore[reportAttributeAccessIssue]
self.page:Page = None # pyright: ignore[reportAttributeAccessIssue]
self._default_timeout_config:TimeoutConfig | None = None
self.config:BotConfig = cast(BotConfig, None)
def _get_timeout_config(self) -> TimeoutConfig:
    # Resolve the timeout configuration lazily: the mixin may be used before a
    # bot config is attached, so both `config` and `config.timeouts` are probed
    # defensively via getattr.
    config = getattr(self, "config", None)
    timeouts:TimeoutConfig | None = None
    if config is not None:
        timeouts = cast(Optional[TimeoutConfig], getattr(config, "timeouts", None))
    if timeouts is not None:
        return timeouts
    # No configured timeouts available: create a default-valued TimeoutConfig
    # once and cache it for subsequent lookups.
    if self._default_timeout_config is None:
        self._default_timeout_config = TimeoutConfig()
    return self._default_timeout_config
def _timeout(self, key:str = "default", override:float | None = None) -> float:
    """
    Return the base timeout (seconds) for a given key without applying multipliers.

    :param key: TimeoutConfig field name to look up
    :param override: explicit value that bypasses the configured field
    """
    return self._get_timeout_config().resolve(key, override)
def _effective_timeout(self, key:str = "default", override:float | None = None, *, attempt:int = 0) -> float:
    """
    Return the effective timeout (seconds) with multiplier/backoff applied.

    :param key: TimeoutConfig field name to look up
    :param override: explicit base value that bypasses the configured field
    :param attempt: zero-based retry attempt used for exponential backoff
    """
    return self._get_timeout_config().effective(key, override, attempt = attempt)
def _timeout_attempts(self) -> int:
    """Return the total number of attempts: the initial try plus configured retries."""
    cfg = self._get_timeout_config()
    # When retries are disabled only the initial attempt is performed.
    return (1 + cfg.retry_max_attempts) if cfg.retry_enabled else 1
async def _run_with_timeout_retries(
    self,
    operation:Callable[[float], Awaitable[T]],
    *,
    description:str,
    key:str = "default",
    override:float | None = None
) -> T:
    """
    Execute an async callable with retry/backoff handling for TimeoutError.

    :param operation: coroutine factory receiving the effective timeout (seconds) for the current attempt
    :param description: human-readable label used in retry log messages
    :param key: TimeoutConfig key used to resolve the base timeout
    :param override: explicit base timeout that bypasses the configured value
    :raises TimeoutError: re-raised from the final attempt once all retries are exhausted
    """
    attempts = self._timeout_attempts()
    for attempt in range(attempts):
        # Each retry receives a longer timeout: exponential backoff is applied
        # inside _effective_timeout via the `attempt` index.
        effective_timeout = self._effective_timeout(key, override, attempt = attempt)
        try:
            return await operation(effective_timeout)
        except TimeoutError:
            if attempt >= attempts - 1:
                # Final attempt failed: propagate the original TimeoutError.
                raise
            LOG.debug(
                "Retrying %s after TimeoutError (attempt %d/%d, timeout %.1fs)",
                description,
                attempt + 1,
                attempts,
                effective_timeout
            )
    # Unreachable in practice (attempts >= 1); kept as a defensive guard so a
    # future refactor cannot fall off the end and implicitly return None.
    raise TimeoutError(f"{description} failed without executing operation")
async def create_browser_session(self) -> None:
LOG.info("Creating Browser session...")
@@ -137,7 +204,7 @@ class WebScrapingMixin:
f"Make sure the browser is running and the port is not blocked by firewall.")
try:
cfg = Config(
cfg = NodriverConfig(
browser_executable_path = self.browser_config.binary_location # actually not necessary but nodriver fails without
)
cfg.host = remote_host
@@ -207,7 +274,7 @@ class WebScrapingMixin:
if self.browser_config.user_data_dir:
LOG.info(" -> Browser user data dir: %s", self.browser_config.user_data_dir)
cfg = Config(
cfg = NodriverConfig(
headless = False,
browser_executable_path = self.browser_config.binary_location,
browser_args = browser_args,
@@ -355,7 +422,8 @@ class WebScrapingMixin:
LOG.info("(ok) Remote debugging port is open")
# Try to get more information about the debugging endpoint
try:
response = urllib.request.urlopen(f"http://127.0.0.1:{remote_port}/json/version", timeout = 2)
probe_timeout = self._effective_timeout("chrome_remote_probe")
response = urllib.request.urlopen(f"http://127.0.0.1:{remote_port}/json/version", timeout = probe_timeout)
version_info = json.loads(response.read().decode())
LOG.info("(ok) Remote debugging API accessible - Browser: %s", version_info.get("Browser", "Unknown"))
except Exception as e:
@@ -378,30 +446,34 @@ class WebScrapingMixin:
except (AssertionError, TypeError):
target_browser_name = ""
for proc in psutil.process_iter(["pid", "name", "cmdline"]):
try:
proc_name = proc.info["name"] or ""
cmdline = proc.info["cmdline"] or []
try:
for proc in psutil.process_iter(["pid", "name", "cmdline"]):
try:
proc_name = proc.info["name"] or ""
cmdline = proc.info["cmdline"] or []
# Check if this is a browser process relevant to our diagnostics
is_relevant_browser = False
# Check if this is a browser process relevant to our diagnostics
is_relevant_browser = False
# Is this the target browser?
is_target_browser = target_browser_name and target_browser_name in proc_name.lower()
# Is this the target browser?
is_target_browser = target_browser_name and target_browser_name in proc_name.lower()
# Does it have remote debugging?
has_remote_debugging = cmdline and any(arg.startswith("--remote-debugging-port=") for arg in cmdline)
# Does it have remote debugging?
has_remote_debugging = cmdline and any(arg.startswith("--remote-debugging-port=") for arg in cmdline)
# Detect target browser processes for diagnostics
if is_target_browser:
is_relevant_browser = True
# Add debugging status to the process info for better diagnostics
proc.info["has_remote_debugging"] = has_remote_debugging
# Detect target browser processes for diagnostics
if is_target_browser:
is_relevant_browser = True
# Add debugging status to the process info for better diagnostics
proc.info["has_remote_debugging"] = has_remote_debugging
if is_relevant_browser:
browser_processes.append(proc.info)
except (psutil.NoSuchProcess, psutil.AccessDenied):
pass
if is_relevant_browser:
browser_processes.append(proc.info)
except (psutil.NoSuchProcess, psutil.AccessDenied):
pass
except (psutil.Error, PermissionError) as exc:
LOG.warning("(warn) Unable to inspect browser processes: %s", exc)
browser_processes = []
if browser_processes:
LOG.info("(info) Found %d browser processes running", len(browser_processes))
@@ -486,15 +558,17 @@ class WebScrapingMixin:
raise AssertionError(_("Installed browser could not be detected"))
async def web_await(self, condition:Callable[[], T | Never | Coroutine[Any, Any, T | Never]], *,
timeout:int | float = 5, timeout_error_message:str = "") -> T:
timeout:int | float | None = None, timeout_error_message:str = "", apply_multiplier:bool = True) -> T:
"""
Blocks/waits until the given condition is met.
:param timeout: timeout in seconds
:param timeout: timeout in seconds (base value, multiplier applied unless disabled)
:raises TimeoutError: if element could not be found within time
"""
loop = asyncio.get_running_loop()
start_at = loop.time()
base_timeout = timeout if timeout is not None else self._timeout()
effective_timeout = self._effective_timeout(override = base_timeout) if apply_multiplier else base_timeout
while True:
await self.page
@@ -506,13 +580,13 @@ class WebScrapingMixin:
return result
except Exception as ex1:
ex = ex1
if loop.time() - start_at > timeout:
if loop.time() - start_at > effective_timeout:
if ex:
raise ex
raise TimeoutError(timeout_error_message or f"Condition not met within {timeout} seconds")
raise TimeoutError(timeout_error_message or f"Condition not met within {effective_timeout} seconds")
await self.page.sleep(0.5)
async def web_check(self, selector_type:By, selector_value:str, attr:Is, *, timeout:int | float = 5) -> bool:
async def web_check(self, selector_type:By, selector_value:str, attr:Is, *, timeout:int | float | None = None) -> bool:
"""
Locates an HTML element and returns a state.
@@ -559,7 +633,7 @@ class WebScrapingMixin:
"""))
raise AssertionError(_("Unsupported attribute: %s") % attr)
async def web_click(self, selector_type:By, selector_value:str, *, timeout:int | float = 5) -> Element:
async def web_click(self, selector_type:By, selector_value:str, *, timeout:int | float | None = None) -> Element:
"""
Locates an HTML element by ID.
@@ -652,91 +726,130 @@ class WebScrapingMixin:
# Return primitive values as-is
return data
async def web_find(self, selector_type:By, selector_value:str, *, parent:Element | None = None, timeout:int | float = 5) -> Element:
async def web_find(self, selector_type:By, selector_value:str, *, parent:Element | None = None, timeout:int | float | None = None) -> Element:
"""
Locates an HTML element by the given selector type and value.
:param timeout: timeout in seconds
:param timeout: timeout in seconds (base value before multiplier/backoff)
:raises TimeoutError: if element could not be found within time
"""
async def attempt(effective_timeout:float) -> Element:
return await self._web_find_once(selector_type, selector_value, effective_timeout, parent = parent)
return await self._run_with_timeout_retries(
attempt,
description = f"web_find({selector_type.name}, {selector_value})",
key = "default",
override = timeout
)
async def web_find_all(self, selector_type:By, selector_value:str, *, parent:Element | None = None, timeout:int | float | None = None) -> list[Element]:
"""
Locates multiple HTML elements by the given selector type and value.
:param timeout: timeout in seconds (base value before multiplier/backoff)
:raises TimeoutError: if element could not be found within time
"""
async def attempt(effective_timeout:float) -> list[Element]:
return await self._web_find_all_once(selector_type, selector_value, effective_timeout, parent = parent)
return await self._run_with_timeout_retries(
attempt,
description = f"web_find_all({selector_type.name}, {selector_value})",
key = "default",
override = timeout
)
async def _web_find_once(self, selector_type:By, selector_value:str, timeout:float, *, parent:Element | None = None) -> Element:
timeout_suffix = f" within {timeout} seconds."
match selector_type:
case By.ID:
escaped_id = selector_value.translate(METACHAR_ESCAPER)
return await self.web_await(
lambda: self.page.query_selector(f"#{escaped_id}", parent),
timeout = timeout,
timeout_error_message = f"No HTML element found with ID '{selector_value}' within {timeout} seconds.")
timeout_error_message = f"No HTML element found with ID '{selector_value}'{timeout_suffix}",
apply_multiplier = False)
case By.CLASS_NAME:
escaped_classname = selector_value.translate(METACHAR_ESCAPER)
return await self.web_await(
lambda: self.page.query_selector(f".{escaped_classname}", parent),
timeout = timeout,
timeout_error_message = f"No HTML element found with CSS class '{selector_value}' within {timeout} seconds.")
timeout_error_message = f"No HTML element found with CSS class '{selector_value}'{timeout_suffix}",
apply_multiplier = False)
case By.TAG_NAME:
return await self.web_await(
lambda: self.page.query_selector(selector_value, parent),
timeout = timeout,
timeout_error_message = f"No HTML element found of tag <{selector_value}> within {timeout} seconds.")
timeout_error_message = f"No HTML element found of tag <{selector_value}>{timeout_suffix}",
apply_multiplier = False)
case By.CSS_SELECTOR:
return await self.web_await(
lambda: self.page.query_selector(selector_value, parent),
timeout = timeout,
timeout_error_message = f"No HTML element found using CSS selector '{selector_value}' within {timeout} seconds.")
timeout_error_message = f"No HTML element found using CSS selector '{selector_value}'{timeout_suffix}",
apply_multiplier = False)
case By.TEXT:
ensure(not parent, f"Specifying a parent element currently not supported with selector type: {selector_type}")
return await self.web_await(
lambda: self.page.find_element_by_text(selector_value, best_match = True),
timeout = timeout,
timeout_error_message = f"No HTML element found containing text '{selector_value}' within {timeout} seconds.")
timeout_error_message = f"No HTML element found containing text '{selector_value}'{timeout_suffix}",
apply_multiplier = False)
case By.XPATH:
ensure(not parent, f"Specifying a parent element currently not supported with selector type: {selector_type}")
return await self.web_await(
lambda: self.page.find_element_by_text(selector_value, best_match = True),
timeout = timeout,
timeout_error_message = f"No HTML element found using XPath '{selector_value}' within {timeout} seconds.")
timeout_error_message = f"No HTML element found using XPath '{selector_value}'{timeout_suffix}",
apply_multiplier = False)
raise AssertionError(_("Unsupported selector type: %s") % selector_type)
async def web_find_all(self, selector_type:By, selector_value:str, *, parent:Element | None = None, timeout:int | float = 5) -> list[Element]:
"""
Locates an HTML element by ID.
async def _web_find_all_once(self, selector_type:By, selector_value:str, timeout:float, *, parent:Element | None = None) -> list[Element]:
timeout_suffix = f" within {timeout} seconds."
:param timeout: timeout in seconds
:raises TimeoutError: if element could not be found within time
"""
match selector_type:
case By.CLASS_NAME:
escaped_classname = selector_value.translate(METACHAR_ESCAPER)
return await self.web_await(
lambda: self.page.query_selector_all(f".{escaped_classname}", parent),
timeout = timeout,
timeout_error_message = f"No HTML elements found with CSS class '{selector_value}' within {timeout} seconds.")
timeout_error_message = f"No HTML elements found with CSS class '{selector_value}'{timeout_suffix}",
apply_multiplier = False)
case By.CSS_SELECTOR:
return await self.web_await(
lambda: self.page.query_selector_all(selector_value, parent),
timeout = timeout,
timeout_error_message = f"No HTML elements found using CSS selector '{selector_value}' within {timeout} seconds.")
timeout_error_message = f"No HTML elements found using CSS selector '{selector_value}'{timeout_suffix}",
apply_multiplier = False)
case By.TAG_NAME:
return await self.web_await(
lambda: self.page.query_selector_all(selector_value, parent),
timeout = timeout,
timeout_error_message = f"No HTML elements found of tag <{selector_value}> within {timeout} seconds.")
timeout_error_message = f"No HTML elements found of tag <{selector_value}>{timeout_suffix}",
apply_multiplier = False)
case By.TEXT:
ensure(not parent, f"Specifying a parent element currently not supported with selector type: {selector_type}")
return await self.web_await(
lambda: self.page.find_elements_by_text(selector_value),
timeout = timeout,
timeout_error_message = f"No HTML elements found containing text '{selector_value}' within {timeout} seconds.")
timeout_error_message = f"No HTML elements found containing text '{selector_value}'{timeout_suffix}",
apply_multiplier = False)
case By.XPATH:
ensure(not parent, f"Specifying a parent element currently not supported with selector type: {selector_type}")
return await self.web_await(
lambda: self.page.find_elements_by_text(selector_value),
timeout = timeout,
timeout_error_message = f"No HTML elements found using XPath '{selector_value}' within {timeout} seconds.")
timeout_error_message = f"No HTML elements found using XPath '{selector_value}'{timeout_suffix}",
apply_multiplier = False)
raise AssertionError(_("Unsupported selector type: %s") % selector_type)
async def web_input(self, selector_type:By, selector_value:str, text:str | int, *, timeout:int | float = 5) -> Element:
async def web_input(self, selector_type:By, selector_value:str, text:str | int, *, timeout:int | float | None = None) -> Element:
"""
Enters text into an HTML input field.
@@ -749,10 +862,10 @@ class WebScrapingMixin:
await self.web_sleep()
return input_field
async def web_open(self, url:str, *, timeout:int | float = 15_000, reload_if_already_open:bool = False) -> None:
async def web_open(self, url:str, *, timeout:int | float | None = None, reload_if_already_open:bool = False) -> None:
"""
:param url: url to open in browser
:param timeout: timespan in seconds within the page needs to be loaded
:param timeout: timespan in seconds within the page needs to be loaded (base value)
:param reload_if_already_open: if False does nothing if the URL is already open in the browser
:raises TimeoutException: if page did not open within given timespan
"""
@@ -761,10 +874,15 @@ class WebScrapingMixin:
LOG.debug(" => skipping, [%s] is already open", url)
return
self.page = await self.browser.get(url = url, new_tab = False, new_window = False)
await self.web_await(lambda: self.web_execute("document.readyState == 'complete'"), timeout = timeout,
timeout_error_message = f"Page did not finish loading within {timeout} seconds.")
page_timeout = self._effective_timeout("page_load", timeout)
await self.web_await(
lambda: self.web_execute("document.readyState == 'complete'"),
timeout = page_timeout,
timeout_error_message = f"Page did not finish loading within {page_timeout} seconds.",
apply_multiplier = False
)
async def web_text(self, selector_type:By, selector_value:str, *, parent:Element | None = None, timeout:int | float = 5) -> str:
async def web_text(self, selector_type:By, selector_value:str, *, parent:Element | None = None, timeout:int | float | None = None) -> str:
return str(await (await self.web_find(selector_type, selector_value, parent = parent, timeout = timeout)).apply("""
function (elem) {
let sel = window.getSelection()
@@ -835,7 +953,7 @@ class WebScrapingMixin:
await self.web_execute(f"window.scrollTo(0, {current_y_pos})")
await asyncio.sleep(scroll_length / scroll_speed / 2) # double speed
async def web_select(self, selector_type:By, selector_value:str, selected_value:Any, timeout:int | float = 5) -> Element:
async def web_select(self, selector_type:By, selector_value:str, selected_value:Any, timeout:int | float | None = None) -> Element:
"""
Selects an <option/> of a <select/> HTML element.
@@ -895,7 +1013,11 @@ class WebScrapingMixin:
port_available = await self._check_port_with_retry(remote_host, remote_port)
if port_available:
try:
version_info = detect_chrome_version_from_remote_debugging(remote_host, remote_port)
version_info = detect_chrome_version_from_remote_debugging(
remote_host,
remote_port,
timeout = self._effective_timeout("chrome_remote_debugging")
)
if version_info:
LOG.debug(" -> Detected version from existing browser: %s", version_info)
else:
@@ -910,7 +1032,10 @@ class WebScrapingMixin:
binary_path = self.browser_config.binary_location
if binary_path:
LOG.debug(" -> No remote browser detected, trying binary detection")
version_info = detect_chrome_version_from_binary(binary_path)
version_info = detect_chrome_version_from_binary(
binary_path,
timeout = self._effective_timeout("chrome_binary_detection")
)
# Validate if Chrome 136+ detected
if version_info and version_info.is_chrome_136_plus:
@@ -977,7 +1102,10 @@ class WebScrapingMixin:
binary_path = self.browser_config.binary_location
diagnostic_info = get_chrome_version_diagnostic_info(
binary_path = binary_path,
remote_port = remote_port if remote_port > 0 else None
remote_host = "127.0.0.1",
remote_port = remote_port if remote_port > 0 else None,
remote_timeout = self._effective_timeout("chrome_remote_debugging"),
binary_timeout = self._effective_timeout("chrome_binary_detection")
)
# Report binary detection results