From a3ac27c4412b67c998c0a970bb184ce7128d5db0 Mon Sep 17 00:00:00 2001 From: Jens <1742418+1cu@users.noreply.github.com> Date: Thu, 13 Nov 2025 15:08:52 +0100 Subject: [PATCH] feat: add configurable timeouts (#673) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## ℹ️ Description - Related issues: #671, #658 - Introduces configurable timeout controls plus retry/backoff handling for flaky DOM operations. We often see timeouts which are not reproducible in certain configurations. I suspect timeout issues based on a combination of internet speed, browser, os, age of the computer and the weather. This PR introduces a comprehensive config model to tweak timeouts. ## 📋 Changes Summary - add TimeoutConfig to the main config/schema and expose timeouts in README/docs - wire WebScrapingMixin, extractor, update checker, and browser diagnostics to honor the configurable timeouts and retries - update translations/tests to cover the new behaviour and ensure lint/mypy/pyright pipelines remain green ### ⚙️ Type of Change - [ ] 🐞 Bug fix (non-breaking change which fixes an issue) - [x] ✨ New feature (adds new functionality without breaking existing usage) - [ ] 💥 Breaking change (changes that might break existing user setups, scripts, or configurations) ## ✅ Checklist - [x] I have reviewed my changes to ensure they meet the project's standards. - [x] I have tested my changes and ensured that all tests pass (`pdm run test`). - [x] I have formatted the code (`pdm run format`). - [x] I have verified that linting passes (`pdm run lint`). - [x] I have updated documentation where necessary. ## Summary by CodeRabbit * **New Features** * Centralized, configurable timeout system for web interactions, detection flows, publishing, and pagination. * Optional retry with exponential backoff for operations that time out. * **Improvements** * Replaced fixed wait times with dynamic timeouts throughout workflows. 
* More informative timeout-related messages and diagnostics. * **Tests** * New and expanded test coverage for timeout behavior, pagination, diagnostics, and retry logic. --- CONTRIBUTING.md | 9 +- README.md | 23 ++ docs/BROWSER_TROUBLESHOOTING.md | 12 + schemas/config.schema.json | 135 ++++++++ src/kleinanzeigen_bot/__init__.py | 30 +- src/kleinanzeigen_bot/extract.py | 23 +- src/kleinanzeigen_bot/model/config_model.py | 50 +++ .../resources/translations.de.yaml | 15 +- src/kleinanzeigen_bot/update_checker.py | 12 +- .../utils/chrome_version_detector.py | 25 +- .../utils/web_scraping_mixin.py | 248 ++++++++++---- tests/unit/test_config_model.py | 51 ++- tests/unit/test_extract.py | 72 ++++ tests/unit/test_update_checker.py | 12 + tests/unit/test_web_scraping_mixin.py | 312 +++++++++++++++++- .../test_web_scraping_mixin_chrome_version.py | 64 +++- 16 files changed, 972 insertions(+), 121 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index b58d906..eb73735 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -187,6 +187,14 @@ All Python files must start with SPDX license headers: - Use appropriate log levels (DEBUG, INFO, WARNING, ERROR) - Log important state changes and decision points +#### Timeout configuration +- The default timeout (`timeouts.default`) already wraps all standard DOM helpers (`web_find`, `web_click`, etc.) via `WebScrapingMixin._timeout/_effective_timeout`. Use it unless a workflow clearly needs a different SLA. +- Reserve `timeouts.quick_dom` for transient overlays (shipping dialogs, payment prompts, toast banners) that should render almost instantly; call `self._timeout("quick_dom")` in those spots to keep the UI responsive. +- For single selectors that occasionally need more headroom, pass an inline override instead of creating a new config key, e.g. `custom = self._timeout(override = 12.5); await self.web_find(..., timeout = custom)`. 
+- Use `_timeout()` when you just need the raw configured value (with optional override); use `_effective_timeout()` when you rely on the global multiplier and retry backoff for a given attempt (e.g. inside `_run_with_timeout_retries`). +- Add a new timeout key only when a recurring workflow has its own timing profile (pagination, captcha detection, publishing confirmations, Chrome probes, etc.). Whenever you add one, extend `TimeoutConfig`, document it in the sample `timeouts:` block in `README.md`, and explain it in `docs/BROWSER_TROUBLESHOOTING.md`. +- Encourage users to raise `timeouts.multiplier` when everything is slow, and override existing keys in `config.yaml` before introducing new ones. This keeps the configuration surface minimal. + #### Examples ```python def parse_duration(text: str) -> timedelta: @@ -297,4 +305,3 @@ See the [LICENSE.txt](LICENSE.txt) file for our project's licensing. All source - Use the translation system for all output—**never hardcode German or other languages** in the code. - If you add or change a user-facing message, update the translation file and ensure that translation completeness tests pass (`tests/unit/test_translations.py`). - Review the translation guidelines and patterns in the codebase for correct usage. - diff --git a/README.md b/README.md index 3899c87..f5333cb 100644 --- a/README.md +++ b/README.md @@ -277,6 +277,27 @@ categories: Verschenken & Tauschen > Verleihen: 272/274 Verschenken & Tauschen > Verschenken: 272/192 +# timeout tuning (optional) +timeouts: + multiplier: 1.0 # Scale all timeouts (e.g. 2.0 for slower networks) + default: 5.0 # Base timeout for web_find/web_click/etc. 
+ page_load: 15.0 # Timeout for web_open page loads + captcha_detection: 2.0 # Timeout for captcha iframe detection + sms_verification: 4.0 # Timeout for SMS verification banners + gdpr_prompt: 10.0 # Timeout when handling GDPR dialogs + publishing_result: 300.0 # Timeout for publishing status checks + publishing_confirmation: 20.0 # Timeout for publish confirmation redirect + pagination_initial: 10.0 # Timeout for first pagination lookup + pagination_follow_up: 5.0 # Timeout for subsequent pagination clicks + quick_dom: 2.0 # Generic short DOM timeout (shipping dialogs, etc.) + update_check: 10.0 # Timeout for GitHub update requests + chrome_remote_probe: 2.0 # Timeout for local remote-debugging probes + chrome_remote_debugging: 5.0 # Timeout for remote debugging API calls + chrome_binary_detection: 10.0 # Timeout for chrome --version subprocess + retry_enabled: true # Enables DOM retry/backoff when timeouts occur + retry_max_attempts: 2 + retry_backoff_factor: 1.5 + # download configuration download: include_all_matching_shipping_options: false # if true, all shipping options matching the package size will be included @@ -329,6 +350,8 @@ login: password: "" ``` +Slow networks or sluggish remote browsers often just need a higher `timeouts.multiplier`, while truly problematic selectors can get explicit values directly under `timeouts`. Remember to regenerate the schemas after changing the configuration model so editors stay in sync. + ### 2) Ad configuration Each ad is described in a separate JSON or YAML file with prefix `ad_`. The prefix is configurable in config file. diff --git a/docs/BROWSER_TROUBLESHOOTING.md b/docs/BROWSER_TROUBLESHOOTING.md index 9ddc328..ba977d3 100644 --- a/docs/BROWSER_TROUBLESHOOTING.md +++ b/docs/BROWSER_TROUBLESHOOTING.md @@ -59,6 +59,18 @@ Please update your configuration to include --user-data-dir for remote debugging The bot will also provide specific instructions on how to fix your configuration. 
+### Issue: Slow page loads or recurring TimeoutError + +**Symptoms:** +- `_extract_category_from_ad_page` fails intermittently due to breadcrumb lookups timing out +- Captcha/SMS/GDPR prompts appear right after a timeout +- Requests to GitHub's API fail sporadically with timeout errors + +**Solutions:** +1. Increase `timeouts.multiplier` in `config.yaml` (e.g. `2.0` doubles every timeout consistently). +2. Override specific keys under `timeouts` (e.g. `pagination_initial: 20.0`) if only a single selector is problematic. +3. Keep `retry_enabled` on so that DOM lookups are retried with exponential backoff. + ## Common Issues and Solutions ### Issue 1: "Failed to connect to browser" with "root" error diff --git a/schemas/config.schema.json b/schemas/config.schema.json index 5ff1943..7f288b7 100644 --- a/schemas/config.schema.json +++ b/schemas/config.schema.json @@ -359,6 +359,137 @@ "title": "PublishingConfig", "type": "object" }, + "TimeoutConfig": { + "properties": { + "multiplier": { + "default": 1.0, + "description": "Global multiplier applied to all timeout values.", + "minimum": 0.1, + "title": "Multiplier", + "type": "number" + }, + "default": { + "type": "number", + "minimum": 0.0, + "default": 5.0, + "description": "Baseline timeout for DOM interactions.", + "title": "Default" + }, + "page_load": { + "default": 15.0, + "description": "Page load timeout for web_open.", + "minimum": 1.0, + "title": "Page Load", + "type": "number" + }, + "captcha_detection": { + "default": 2.0, + "description": "Timeout for captcha iframe detection.", + "minimum": 0.1, + "title": "Captcha Detection", + "type": "number" + }, + "sms_verification": { + "default": 4.0, + "description": "Timeout for SMS verification prompts.", + "minimum": 0.1, + "title": "Sms Verification", + "type": "number" + }, + "gdpr_prompt": { + "default": 10.0, + "description": "Timeout for GDPR/consent dialogs.", + "minimum": 1.0, + "title": "Gdpr Prompt", + "type": "number" + }, + "publishing_result": { + 
"default": 300.0, + "description": "Timeout for publishing result checks.", + "minimum": 10.0, + "title": "Publishing Result", + "type": "number" + }, + "publishing_confirmation": { + "default": 20.0, + "description": "Timeout for publish confirmation redirect.", + "minimum": 1.0, + "title": "Publishing Confirmation", + "type": "number" + }, + "pagination_initial": { + "default": 10.0, + "description": "Timeout for initial pagination lookup.", + "minimum": 1.0, + "title": "Pagination Initial", + "type": "number" + }, + "pagination_follow_up": { + "default": 5.0, + "description": "Timeout for subsequent pagination navigation.", + "minimum": 1.0, + "title": "Pagination Follow Up", + "type": "number" + }, + "quick_dom": { + "default": 2.0, + "description": "Generic short timeout for transient UI.", + "minimum": 0.1, + "title": "Quick Dom", + "type": "number" + }, + "update_check": { + "default": 10.0, + "description": "Timeout for GitHub update checks.", + "minimum": 1.0, + "title": "Update Check", + "type": "number" + }, + "chrome_remote_probe": { + "default": 2.0, + "description": "Timeout for local remote-debugging probes.", + "minimum": 0.1, + "title": "Chrome Remote Probe", + "type": "number" + }, + "chrome_remote_debugging": { + "default": 5.0, + "description": "Timeout for remote debugging API calls.", + "minimum": 1.0, + "title": "Chrome Remote Debugging", + "type": "number" + }, + "chrome_binary_detection": { + "default": 10.0, + "description": "Timeout for chrome --version subprocesses.", + "minimum": 1.0, + "title": "Chrome Binary Detection", + "type": "number" + }, + "retry_enabled": { + "default": true, + "description": "Enable built-in retry/backoff for DOM operations.", + "title": "Retry Enabled", + "type": "boolean" + }, + "retry_max_attempts": { + "default": 2, + "description": "Max retry attempts when retry is enabled.", + "minimum": 1, + "title": "Retry Max Attempts", + "type": "integer" + }, + "retry_backoff_factor": { + "default": 1.5, + 
"description": "Exponential factor applied per retry attempt.", + "minimum": 1.0, + "title": "Retry Backoff Factor", + "type": "number" + } + }, + "title": "TimeoutConfig", + "type": "object" + }, "UpdateCheckConfig": { "description": "Configuration for update checking functionality.\n\nAttributes:\n enabled: Whether update checking is enabled.\n channel: Which release channel to check ('latest' for stable, 'preview' for prereleases).\n interval: How often to check for updates (e.g. '7d', '1d').\n If the interval is invalid, too short (<1d), or too long (>30d),\n the bot will log a warning and use a default interval for this run:\n - 1d for 'preview' channel\n - 7d for 'latest' channel\n The config file is not changed automatically; please fix your config to avoid repeated warnings.", "properties": { @@ -428,6 +559,10 @@ "update_check": { "$ref": "#/$defs/UpdateCheckConfig", "description": "Update check configuration" + }, + "timeouts": { + "$ref": "#/$defs/TimeoutConfig", + "description": "Centralized timeout configuration." 
} }, "title": "Config", diff --git a/src/kleinanzeigen_bot/__init__.py b/src/kleinanzeigen_bot/__init__.py index 8fa8209..8d632fd 100644 --- a/src/kleinanzeigen_bot/__init__.py +++ b/src/kleinanzeigen_bot/__init__.py @@ -573,8 +573,9 @@ class KleinanzeigenBot(WebScrapingMixin): async def check_and_wait_for_captcha(self, *, is_login_page:bool = True) -> None: try: + captcha_timeout = self._timeout("captcha_detection") await self.web_find(By.CSS_SELECTOR, - "iframe[name^='a-'][src^='https://www.google.com/recaptcha/api2/anchor?']", timeout = 2) + "iframe[name^='a-'][src^='https://www.google.com/recaptcha/api2/anchor?']", timeout = captcha_timeout) if not is_login_page and self.config.captcha.auto_restart: LOG.warning("Captcha recognized - auto-restart enabled, abort run...") @@ -624,7 +625,8 @@ class KleinanzeigenBot(WebScrapingMixin): async def handle_after_login_logic(self) -> None: try: - await self.web_find(By.TEXT, "Wir haben dir gerade einen 6-stelligen Code für die Telefonnummer", timeout = 4) + sms_timeout = self._timeout("sms_verification") + await self.web_find(By.TEXT, "Wir haben dir gerade einen 6-stelligen Code für die Telefonnummer", timeout = sms_timeout) LOG.warning("############################################") LOG.warning("# Device verification message detected. 
Please follow the instruction displayed in the Browser.") LOG.warning("############################################") @@ -634,9 +636,12 @@ class KleinanzeigenBot(WebScrapingMixin): try: LOG.info("Handling GDPR disclaimer...") - await self.web_find(By.ID, "gdpr-banner-accept", timeout = 10) + gdpr_timeout = self._timeout("gdpr_prompt") + await self.web_find(By.ID, "gdpr-banner-accept", timeout = gdpr_timeout) await self.web_click(By.ID, "gdpr-banner-cmp-button") - await self.web_click(By.XPATH, "//div[@id='ConsentManagementPage']//*//button//*[contains(., 'Alle ablehnen und fortfahren')]", timeout = 10) + await self.web_click(By.XPATH, + "//div[@id='ConsentManagementPage']//*//button//*[contains(., 'Alle ablehnen und fortfahren')]", + timeout = gdpr_timeout) except TimeoutError: pass @@ -724,7 +729,8 @@ class KleinanzeigenBot(WebScrapingMixin): count += 1 await self.publish_ad(ad_file, ad_cfg, ad_cfg_orig, published_ads, AdUpdateStrategy.REPLACE) - await self.web_await(self.__check_publishing_result, timeout = 5 * 60) + publish_timeout = self._timeout("publishing_result") + await self.web_await(self.__check_publishing_result, timeout = publish_timeout) if self.config.publishing.delete_old_ads == "AFTER_PUBLISH" and not self.keep_old_ads: await self.delete_ad(ad_cfg, published_ads, delete_old_ads_by_title = False) @@ -924,7 +930,8 @@ class KleinanzeigenBot(WebScrapingMixin): # wait for payment form if commercial account is used ############################# try: - await self.web_find(By.ID, "myftr-shppngcrt-frm", timeout = 2) + short_timeout = self._timeout("quick_dom") + await self.web_find(By.ID, "myftr-shppngcrt-frm", timeout = short_timeout) LOG.warning("############################################") LOG.warning("# Payment form detected! 
Please proceed with payment.") @@ -934,7 +941,8 @@ class KleinanzeigenBot(WebScrapingMixin): except TimeoutError: pass - await self.web_await(lambda: "p-anzeige-aufgeben-bestaetigung.html?adId=" in self.page.url, timeout = 20) + confirmation_timeout = self._timeout("publishing_confirmation") + await self.web_await(lambda: "p-anzeige-aufgeben-bestaetigung.html?adId=" in self.page.url, timeout = confirmation_timeout) # extract the ad id from the URL's query parameter current_url_query_params = urllib_parse.parse_qs(urllib_parse.urlparse(self.page.url).query) @@ -986,7 +994,8 @@ class KleinanzeigenBot(WebScrapingMixin): count += 1 await self.publish_ad(ad_file, ad_cfg, ad_cfg_orig, published_ads, AdUpdateStrategy.MODIFY) - await self.web_await(self.__check_publishing_result, timeout = 5 * 60) + publish_timeout = self._timeout("publishing_result") + await self.web_await(self.__check_publishing_result, timeout = publish_timeout) LOG.info("############################################") LOG.info("DONE: updated %s", pluralize("ad", count)) @@ -1080,6 +1089,7 @@ class KleinanzeigenBot(WebScrapingMixin): LOG.debug("Successfully set attribute field [%s] to [%s]...", special_attribute_key, special_attribute_value_str) async def __set_shipping(self, ad_cfg:Ad, mode:AdUpdateStrategy = AdUpdateStrategy.REPLACE) -> None: + short_timeout = self._timeout("quick_dom") if ad_cfg.shipping_type == "PICKUP": try: await self.web_click(By.ID, "radio-pickup") @@ -1091,7 +1101,7 @@ class KleinanzeigenBot(WebScrapingMixin): if mode == AdUpdateStrategy.MODIFY: try: # when "Andere Versandmethoden" is not available, go back and start over new - await self.web_find(By.XPATH, '//dialog//button[contains(., "Andere Versandmethoden")]', timeout = 2) + await self.web_find(By.XPATH, '//dialog//button[contains(., "Andere Versandmethoden")]', timeout = short_timeout) except TimeoutError: await self.web_click(By.XPATH, '//dialog//button[contains(., "Zurück")]') @@ -1120,7 +1130,7 @@ class 
KleinanzeigenBot(WebScrapingMixin): # (important for mode = UPDATE) await self.web_find(By.XPATH, '//input[contains(@placeholder, "Versandkosten (optional)")]', - timeout = 2) + timeout = short_timeout) except TimeoutError: await self.web_click(By.XPATH, '//*[contains(@id, "INDIVIDUAL") and contains(@data-testid, "Individueller Versand")]') diff --git a/src/kleinanzeigen_bot/extract.py b/src/kleinanzeigen_bot/extract.py index f59c105..da61770 100644 --- a/src/kleinanzeigen_bot/extract.py +++ b/src/kleinanzeigen_bot/extract.py @@ -33,7 +33,7 @@ class AdExtractor(WebScrapingMixin): def __init__(self, browser:Browser, config:Config) -> None: super().__init__() self.browser = browser - self.config = config + self.config:Config = config async def download_ad(self, ad_id:int) -> None: """ @@ -146,9 +146,10 @@ class AdExtractor(WebScrapingMixin): # --- Pagination handling --- multi_page = False + pagination_timeout = self._timeout("pagination_initial") try: # Correct selector: Use uppercase '.Pagination' - pagination_section = await self.web_find(By.CSS_SELECTOR, ".Pagination", timeout = 10) # Increased timeout slightly + pagination_section = await self.web_find(By.CSS_SELECTOR, ".Pagination", timeout = pagination_timeout) # Increased timeout slightly # Correct selector: Use 'aria-label' # Also check if the button is actually present AND potentially enabled (though enabled check isn't strictly necessary here, only for clicking later) next_buttons = await self.web_find_all(By.CSS_SELECTOR, 'button[aria-label="Nächste"]', parent = pagination_section) @@ -204,9 +205,10 @@ class AdExtractor(WebScrapingMixin): break # --- Navigate to next page --- + follow_up_timeout = self._timeout("pagination_follow_up") try: # Find the pagination section again (scope might have changed after scroll/wait) - pagination_section = await self.web_find(By.CSS_SELECTOR, ".Pagination", timeout = 5) + pagination_section = await self.web_find(By.CSS_SELECTOR, ".Pagination", timeout = 
follow_up_timeout) # Find the "Next" button using the correct aria-label selector and ensure it's not disabled next_button_element = None possible_next_buttons = await self.web_find_all(By.CSS_SELECTOR, 'button[aria-label="Nächste"]', parent = pagination_section) @@ -432,8 +434,19 @@ class AdExtractor(WebScrapingMixin): # Fallback to legacy selectors in case the breadcrumb structure is unexpected. LOG.debug(_("Falling back to legacy breadcrumb selectors; collected ids: %s"), category_ids) - category_first_part = await self.web_find(By.CSS_SELECTOR, "a:nth-of-type(2)", parent = category_line) - category_second_part = await self.web_find(By.CSS_SELECTOR, "a:nth-of-type(3)", parent = category_line) + fallback_timeout = self._effective_timeout() + try: + category_first_part = await self.web_find(By.CSS_SELECTOR, "a:nth-of-type(2)", parent = category_line) + category_second_part = await self.web_find(By.CSS_SELECTOR, "a:nth-of-type(3)", parent = category_line) + except TimeoutError as exc: + LOG.error( + "Legacy breadcrumb selectors not found within %.1f seconds (collected ids: %s)", + fallback_timeout, + category_ids + ) + raise TimeoutError( + _("Unable to locate breadcrumb fallback selectors within %(seconds).1f seconds.") % {"seconds": fallback_timeout} + ) from exc href_first:str = str(category_first_part.attrs["href"]) href_second:str = str(category_second_part.attrs["href"]) cat_num_first_raw = href_first.rsplit("/", maxsplit = 1)[-1] diff --git a/src/kleinanzeigen_bot/model/config_model.py b/src/kleinanzeigen_bot/model/config_model.py index 5cdcb5b..a4fab8f 100644 --- a/src/kleinanzeigen_bot/model/config_model.py +++ b/src/kleinanzeigen_bot/model/config_model.py @@ -114,6 +114,55 @@ class CaptchaConfig(ContextualModel): restart_delay:str = "6h" +class TimeoutConfig(ContextualModel): + multiplier:float = Field( + default = 1.0, + ge = 0.1, + description = "Global multiplier applied to all timeout values." 
+ ) + default:float = Field(default = 5.0, ge = 0.0, description = "Baseline timeout for DOM interactions.") + page_load:float = Field(default = 15.0, ge = 1.0, description = "Page load timeout for web_open.") + captcha_detection:float = Field(default = 2.0, ge = 0.1, description = "Timeout for captcha iframe detection.") + sms_verification:float = Field(default = 4.0, ge = 0.1, description = "Timeout for SMS verification prompts.") + gdpr_prompt:float = Field(default = 10.0, ge = 1.0, description = "Timeout for GDPR/consent dialogs.") + publishing_result:float = Field(default = 300.0, ge = 10.0, description = "Timeout for publishing result checks.") + publishing_confirmation:float = Field(default = 20.0, ge = 1.0, description = "Timeout for publish confirmation redirect.") + pagination_initial:float = Field(default = 10.0, ge = 1.0, description = "Timeout for initial pagination lookup.") + pagination_follow_up:float = Field(default = 5.0, ge = 1.0, description = "Timeout for subsequent pagination navigation.") + quick_dom:float = Field(default = 2.0, ge = 0.1, description = "Generic short timeout for transient UI.") + update_check:float = Field(default = 10.0, ge = 1.0, description = "Timeout for GitHub update checks.") + chrome_remote_probe:float = Field(default = 2.0, ge = 0.1, description = "Timeout for local remote-debugging probes.") + chrome_remote_debugging:float = Field(default = 5.0, ge = 1.0, description = "Timeout for remote debugging API calls.") + chrome_binary_detection:float = Field(default = 10.0, ge = 1.0, description = "Timeout for chrome --version subprocesses.") + retry_enabled:bool = Field(default = True, description = "Enable built-in retry/backoff for DOM operations.") + retry_max_attempts:int = Field(default = 2, ge = 1, description = "Max retry attempts when retry is enabled.") + retry_backoff_factor:float = Field(default = 1.5, ge = 1.0, description = "Exponential factor applied per retry attempt.") + + def resolve(self, key:str = 
"default", override:float | None = None) -> float: + """ + Return the base timeout (seconds) for the given key without applying modifiers. + """ + if override is not None: + return float(override) + + if key == "default": + return float(self.default) + + attr = getattr(self, key, None) + if isinstance(attr, (int, float)): + return float(attr) + + return float(self.default) + + def effective(self, key:str = "default", override:float | None = None, *, attempt:int = 0) -> float: + """ + Return the effective timeout (seconds) with multiplier/backoff applied. + """ + base = self.resolve(key, override) + backoff = self.retry_backoff_factor ** attempt if attempt > 0 else 1.0 + return base * self.multiplier * backoff + + def _validate_glob_pattern(v:str) -> str: if not v.strip(): raise ValueError("must be a non-empty, non-blank glob pattern") @@ -154,6 +203,7 @@ Example: login:LoginConfig = Field(default_factory = LoginConfig.model_construct, description = "Login credentials") captcha:CaptchaConfig = Field(default_factory = CaptchaConfig) update_check:UpdateCheckConfig = Field(default_factory = UpdateCheckConfig, description = "Update check configuration") + timeouts:TimeoutConfig = Field(default_factory = TimeoutConfig, description = "Centralized timeout configuration.") def with_values(self, values:dict[str, Any]) -> Config: return Config.model_validate( diff --git a/src/kleinanzeigen_bot/resources/translations.de.yaml b/src/kleinanzeigen_bot/resources/translations.de.yaml index f4fefb2..c7cf78f 100644 --- a/src/kleinanzeigen_bot/resources/translations.de.yaml +++ b/src/kleinanzeigen_bot/resources/translations.de.yaml @@ -219,6 +219,8 @@ kleinanzeigen_bot/extract.py: _extract_category_from_ad_page: "Breadcrumb container 'vap-brdcrmb' not found; cannot extract ad category: %s": "Breadcrumb-Container 'vap-brdcrmb' nicht gefunden; kann Anzeigenkategorie nicht extrahieren: %s" "Falling back to legacy breadcrumb selectors; collected ids: %s": "Weiche auf ältere 
Breadcrumb-Selektoren aus; gesammelte IDs: %s" + "Legacy breadcrumb selectors not found within %.1f seconds (collected ids: %s)": "Ältere Breadcrumb-Selektoren nicht innerhalb von %.1f Sekunden gefunden (gesammelte IDs: %s)" + "Unable to locate breadcrumb fallback selectors within %(seconds).1f seconds.": "Ältere Breadcrumb-Selektoren konnten nicht innerhalb von %(seconds).1f Sekunden gefunden werden." ################################################# kleinanzeigen_bot/utils/i18n.py: @@ -398,11 +400,6 @@ kleinanzeigen_bot/utils/web_scraping_mixin.py: web_check: "Unsupported attribute: %s": "Nicht unterstütztes Attribut: %s" - web_find: - "Unsupported selector type: %s": "Nicht unterstützter Selektor-Typ: %s" - - web_find_all: - "Unsupported selector type: %s": "Nicht unterstützter Selektor-Typ: %s" close_browser_session: "Closing Browser session...": "Schließe Browser-Sitzung..." @@ -417,6 +414,12 @@ kleinanzeigen_bot/utils/web_scraping_mixin.py: web_request: " -> HTTP %s [%s]...": " -> HTTP %s [%s]..." 
+ _web_find_once: + "Unsupported selector type: %s": "Nicht unterstützter Selektor-Typ: %s" + + _web_find_all_once: + "Unsupported selector type: %s": "Nicht unterstützter Selektor-Typ: %s" + diagnose_browser_issues: "=== Browser Connection Diagnostics ===": "=== Browser-Verbindungsdiagnose ===" "=== End Diagnostics ===": "=== Ende der Diagnose ===" @@ -434,6 +437,8 @@ kleinanzeigen_bot/utils/web_scraping_mixin.py: "(info) Remote debugging port configured: %d": "(Info) Remote-Debugging-Port konfiguriert: %d" "(info) Remote debugging port is not open": "(Info) Remote-Debugging-Port ist nicht offen" + "(warn) Unable to inspect browser processes: %s": "(Warnung) Browser-Prozesse konnten nicht überprüft werden: %s" + "(info) No browser processes currently running": "(Info) Derzeit keine Browser-Prozesse aktiv" "(fail) Running as root - this can cause browser issues": "(Fehler) Läuft als Root - dies kann Browser-Probleme verursachen" diff --git a/src/kleinanzeigen_bot/update_checker.py b/src/kleinanzeigen_bot/update_checker.py index 7c7f429..35ccbb3 100644 --- a/src/kleinanzeigen_bot/update_checker.py +++ b/src/kleinanzeigen_bot/update_checker.py @@ -49,6 +49,10 @@ class UpdateChecker: """ return __version__ + def _request_timeout(self) -> float: + """Return the effective timeout for HTTP calls.""" + return self.config.timeouts.effective("update_check") + def _get_commit_hash(self, version:str) -> str | None: """Extract the commit hash from a version string. 
@@ -74,7 +78,7 @@ class UpdateChecker: try: response = requests.get( f"https://api.github.com/repos/Second-Hand-Friends/kleinanzeigen-bot/releases/tags/{tag_name}", - timeout = 10 + timeout = self._request_timeout() ) response.raise_for_status() data = response.json() @@ -97,7 +101,7 @@ class UpdateChecker: try: response = requests.get( f"https://api.github.com/repos/Second-Hand-Friends/kleinanzeigen-bot/commits/{commit}", - timeout = 10 + timeout = self._request_timeout() ) response.raise_for_status() data = response.json() @@ -148,7 +152,7 @@ class UpdateChecker: # Use /releases/latest endpoint for stable releases response = requests.get( "https://api.github.com/repos/Second-Hand-Friends/kleinanzeigen-bot/releases/latest", - timeout = 10 + timeout = self._request_timeout() ) response.raise_for_status() release = response.json() @@ -160,7 +164,7 @@ class UpdateChecker: # Use /releases endpoint and select the most recent prerelease response = requests.get( "https://api.github.com/repos/Second-Hand-Friends/kleinanzeigen-bot/releases", - timeout = 10 + timeout = self._request_timeout() ) response.raise_for_status() releases = response.json() diff --git a/src/kleinanzeigen_bot/utils/chrome_version_detector.py b/src/kleinanzeigen_bot/utils/chrome_version_detector.py index 7dd024e..ffcc2ce 100644 --- a/src/kleinanzeigen_bot/utils/chrome_version_detector.py +++ b/src/kleinanzeigen_bot/utils/chrome_version_detector.py @@ -78,23 +78,25 @@ def _normalize_browser_name(browser_name:str) -> str: return "Chrome" -def detect_chrome_version_from_binary(binary_path:str) -> ChromeVersionInfo | None: +def detect_chrome_version_from_binary(binary_path:str, *, timeout:float | None = None) -> ChromeVersionInfo | None: """ Detect Chrome version by running the browser binary. 
Args: binary_path: Path to the Chrome binary + timeout: Optional timeout (seconds) for the subprocess call Returns: ChromeVersionInfo if successful, None if detection fails """ + effective_timeout = timeout if timeout is not None else 10.0 try: # Run browser with --version flag result = subprocess.run( # noqa: S603 [binary_path, "--version"], check = False, capture_output = True, text = True, - timeout = 10 + timeout = effective_timeout ) if result.returncode != 0: @@ -114,28 +116,30 @@ def detect_chrome_version_from_binary(binary_path:str) -> ChromeVersionInfo | No return ChromeVersionInfo(version_string, major_version, browser_name) except subprocess.TimeoutExpired: - LOG.debug("Browser version command timed out") + LOG.debug("Browser version command timed out after %.1fs", effective_timeout) return None except (subprocess.SubprocessError, ValueError) as e: LOG.debug("Failed to detect browser version: %s", str(e)) return None -def detect_chrome_version_from_remote_debugging(host:str = "127.0.0.1", port:int = 9222) -> ChromeVersionInfo | None: +def detect_chrome_version_from_remote_debugging(host:str = "127.0.0.1", port:int = 9222, *, timeout:float | None = None) -> ChromeVersionInfo | None: """ Detect Chrome version from remote debugging API. 
Args: host: Remote debugging host port: Remote debugging port + timeout: Optional timeout (seconds) for the HTTP request Returns: ChromeVersionInfo if successful, None if detection fails """ + effective_timeout = timeout if timeout is not None else 5.0 try: # Query the remote debugging API url = f"http://{host}:{port}/json/version" - response = urllib.request.urlopen(url, timeout = 5) # noqa: S310 + response = urllib.request.urlopen(url, timeout = effective_timeout) # noqa: S310 version_data = json.loads(response.read().decode()) # Extract version information @@ -200,7 +204,10 @@ def validate_chrome_136_configuration(browser_arguments:list[str], user_data_dir def get_chrome_version_diagnostic_info( binary_path:str | None = None, remote_host:str = "127.0.0.1", - remote_port:int | None = None + remote_port:int | None = None, + *, + remote_timeout:float | None = None, + binary_timeout:float | None = None ) -> dict[str, Any]: """ Get comprehensive Chrome version diagnostic information. @@ -209,6 +216,8 @@ def get_chrome_version_diagnostic_info( binary_path: Path to Chrome binary (optional) remote_host: Remote debugging host remote_port: Remote debugging port (optional) + remote_timeout: Timeout for remote debugging detection + binary_timeout: Timeout for binary detection Returns: Dictionary with diagnostic information @@ -223,7 +232,7 @@ def get_chrome_version_diagnostic_info( # Try binary detection if binary_path: - version_info = detect_chrome_version_from_binary(binary_path) + version_info = detect_chrome_version_from_binary(binary_path, timeout = binary_timeout) if version_info: diagnostic_info["binary_detection"] = { "version_string": version_info.version_string, @@ -235,7 +244,7 @@ def get_chrome_version_diagnostic_info( # Try remote debugging detection if remote_port: - version_info = detect_chrome_version_from_remote_debugging(remote_host, remote_port) + version_info = detect_chrome_version_from_remote_debugging(remote_host, remote_port, timeout = 
remote_timeout) if version_info: diagnostic_info["remote_detection"] = { "version_string": version_info.version_string, diff --git a/src/kleinanzeigen_bot/utils/web_scraping_mixin.py b/src/kleinanzeigen_bot/utils/web_scraping_mixin.py index a4e4669..b4aaa91 100644 --- a/src/kleinanzeigen_bot/utils/web_scraping_mixin.py +++ b/src/kleinanzeigen_bot/utils/web_scraping_mixin.py @@ -2,9 +2,9 @@ # SPDX-License-Identifier: AGPL-3.0-or-later # SPDX-ArtifactOfProjectHomePage: https://github.com/Second-Hand-Friends/kleinanzeigen-bot/ import asyncio, enum, inspect, json, os, platform, secrets, shutil, subprocess, urllib.request # isort: skip # noqa: S404 -from collections.abc import Callable, Coroutine, Iterable +from collections.abc import Awaitable, Callable, Coroutine, Iterable from gettext import gettext as _ -from typing import Any, Final, cast +from typing import Any, Final, Optional, cast try: from typing import Never # type: ignore[attr-defined,unused-ignore] # mypy @@ -15,10 +15,13 @@ import nodriver, psutil # isort: skip from typing import TYPE_CHECKING, TypeGuard from nodriver.core.browser import Browser -from nodriver.core.config import Config +from nodriver.core.config import Config as NodriverConfig from nodriver.core.element import Element from nodriver.core.tab import Tab as Page +from kleinanzeigen_bot.model.config_model import Config as BotConfig +from kleinanzeigen_bot.model.config_model import TimeoutConfig + from . 
import loggers, net from .chrome_version_detector import ( ChromeVersionInfo, @@ -32,6 +35,7 @@ from .misc import T, ensure if TYPE_CHECKING: from nodriver.cdp.runtime import RemoteObject + # Constants for RemoteObject conversion _KEY_VALUE_PAIR_SIZE = 2 @@ -102,6 +106,69 @@ class WebScrapingMixin: self.browser_config:Final[BrowserConfig] = BrowserConfig() self.browser:Browser = None # pyright: ignore[reportAttributeAccessIssue] self.page:Page = None # pyright: ignore[reportAttributeAccessIssue] + self._default_timeout_config:TimeoutConfig | None = None + self.config:BotConfig = cast(BotConfig, None) + + def _get_timeout_config(self) -> TimeoutConfig: + config = getattr(self, "config", None) + timeouts:TimeoutConfig | None = None + if config is not None: + timeouts = cast(Optional[TimeoutConfig], getattr(config, "timeouts", None)) + if timeouts is not None: + return timeouts + + if self._default_timeout_config is None: + self._default_timeout_config = TimeoutConfig() + return self._default_timeout_config + + def _timeout(self, key:str = "default", override:float | None = None) -> float: + """ + Return the base timeout (seconds) for a given key without applying multipliers. + """ + return self._get_timeout_config().resolve(key, override) + + def _effective_timeout(self, key:str = "default", override:float | None = None, *, attempt:int = 0) -> float: + """ + Return the effective timeout (seconds) with multiplier/backoff applied. + """ + return self._get_timeout_config().effective(key, override, attempt = attempt) + + def _timeout_attempts(self) -> int: + cfg = self._get_timeout_config() + if not cfg.retry_enabled: + return 1 + # Always perform the initial attempt plus the configured number of retries. 
+ return 1 + cfg.retry_max_attempts + + async def _run_with_timeout_retries( + self, + operation:Callable[[float], Awaitable[T]], + *, + description:str, + key:str = "default", + override:float | None = None + ) -> T: + """ + Execute an async callable with retry/backoff handling for TimeoutError. + """ + attempts = self._timeout_attempts() + + for attempt in range(attempts): + effective_timeout = self._effective_timeout(key, override, attempt = attempt) + try: + return await operation(effective_timeout) + except TimeoutError: + if attempt >= attempts - 1: + raise + LOG.debug( + "Retrying %s after TimeoutError (attempt %d/%d, timeout %.1fs)", + description, + attempt + 1, + attempts, + effective_timeout + ) + + raise TimeoutError(f"{description} failed without executing operation") async def create_browser_session(self) -> None: LOG.info("Creating Browser session...") @@ -137,7 +204,7 @@ class WebScrapingMixin: f"Make sure the browser is running and the port is not blocked by firewall.") try: - cfg = Config( + cfg = NodriverConfig( browser_executable_path = self.browser_config.binary_location # actually not necessary but nodriver fails without ) cfg.host = remote_host @@ -207,7 +274,7 @@ class WebScrapingMixin: if self.browser_config.user_data_dir: LOG.info(" -> Browser user data dir: %s", self.browser_config.user_data_dir) - cfg = Config( + cfg = NodriverConfig( headless = False, browser_executable_path = self.browser_config.binary_location, browser_args = browser_args, @@ -355,7 +422,8 @@ class WebScrapingMixin: LOG.info("(ok) Remote debugging port is open") # Try to get more information about the debugging endpoint try: - response = urllib.request.urlopen(f"http://127.0.0.1:{remote_port}/json/version", timeout = 2) + probe_timeout = self._effective_timeout("chrome_remote_probe") + response = urllib.request.urlopen(f"http://127.0.0.1:{remote_port}/json/version", timeout = probe_timeout) version_info = json.loads(response.read().decode()) LOG.info("(ok) Remote 
debugging API accessible - Browser: %s", version_info.get("Browser", "Unknown")) except Exception as e: @@ -378,30 +446,34 @@ class WebScrapingMixin: except (AssertionError, TypeError): target_browser_name = "" - for proc in psutil.process_iter(["pid", "name", "cmdline"]): - try: - proc_name = proc.info["name"] or "" - cmdline = proc.info["cmdline"] or [] + try: + for proc in psutil.process_iter(["pid", "name", "cmdline"]): + try: + proc_name = proc.info["name"] or "" + cmdline = proc.info["cmdline"] or [] - # Check if this is a browser process relevant to our diagnostics - is_relevant_browser = False + # Check if this is a browser process relevant to our diagnostics + is_relevant_browser = False - # Is this the target browser? - is_target_browser = target_browser_name and target_browser_name in proc_name.lower() + # Is this the target browser? + is_target_browser = target_browser_name and target_browser_name in proc_name.lower() - # Does it have remote debugging? - has_remote_debugging = cmdline and any(arg.startswith("--remote-debugging-port=") for arg in cmdline) + # Does it have remote debugging? 
+ has_remote_debugging = cmdline and any(arg.startswith("--remote-debugging-port=") for arg in cmdline) - # Detect target browser processes for diagnostics - if is_target_browser: - is_relevant_browser = True - # Add debugging status to the process info for better diagnostics - proc.info["has_remote_debugging"] = has_remote_debugging + # Detect target browser processes for diagnostics + if is_target_browser: + is_relevant_browser = True + # Add debugging status to the process info for better diagnostics + proc.info["has_remote_debugging"] = has_remote_debugging - if is_relevant_browser: - browser_processes.append(proc.info) - except (psutil.NoSuchProcess, psutil.AccessDenied): - pass + if is_relevant_browser: + browser_processes.append(proc.info) + except (psutil.NoSuchProcess, psutil.AccessDenied): + pass + except (psutil.Error, PermissionError) as exc: + LOG.warning("(warn) Unable to inspect browser processes: %s", exc) + browser_processes = [] if browser_processes: LOG.info("(info) Found %d browser processes running", len(browser_processes)) @@ -486,15 +558,17 @@ class WebScrapingMixin: raise AssertionError(_("Installed browser could not be detected")) async def web_await(self, condition:Callable[[], T | Never | Coroutine[Any, Any, T | Never]], *, - timeout:int | float = 5, timeout_error_message:str = "") -> T: + timeout:int | float | None = None, timeout_error_message:str = "", apply_multiplier:bool = True) -> T: """ Blocks/waits until the given condition is met. 
- :param timeout: timeout in seconds + :param timeout: timeout in seconds (base value, multiplier applied unless disabled) :raises TimeoutError: if element could not be found within time """ loop = asyncio.get_running_loop() start_at = loop.time() + base_timeout = timeout if timeout is not None else self._timeout() + effective_timeout = self._effective_timeout(override = base_timeout) if apply_multiplier else base_timeout while True: await self.page @@ -506,13 +580,13 @@ class WebScrapingMixin: return result except Exception as ex1: ex = ex1 - if loop.time() - start_at > timeout: + if loop.time() - start_at > effective_timeout: if ex: raise ex - raise TimeoutError(timeout_error_message or f"Condition not met within {timeout} seconds") + raise TimeoutError(timeout_error_message or f"Condition not met within {effective_timeout} seconds") await self.page.sleep(0.5) - async def web_check(self, selector_type:By, selector_value:str, attr:Is, *, timeout:int | float = 5) -> bool: + async def web_check(self, selector_type:By, selector_value:str, attr:Is, *, timeout:int | float | None = None) -> bool: """ Locates an HTML element and returns a state. @@ -559,7 +633,7 @@ class WebScrapingMixin: """)) raise AssertionError(_("Unsupported attribute: %s") % attr) - async def web_click(self, selector_type:By, selector_value:str, *, timeout:int | float = 5) -> Element: + async def web_click(self, selector_type:By, selector_value:str, *, timeout:int | float | None = None) -> Element: """ Locates an HTML element by ID. @@ -652,91 +726,130 @@ class WebScrapingMixin: # Return primitive values as-is return data - async def web_find(self, selector_type:By, selector_value:str, *, parent:Element | None = None, timeout:int | float = 5) -> Element: + async def web_find(self, selector_type:By, selector_value:str, *, parent:Element | None = None, timeout:int | float | None = None) -> Element: """ Locates an HTML element by the given selector type and value. 
- :param timeout: timeout in seconds + :param timeout: timeout in seconds (base value before multiplier/backoff) :raises TimeoutError: if element could not be found within time """ + + async def attempt(effective_timeout:float) -> Element: + return await self._web_find_once(selector_type, selector_value, effective_timeout, parent = parent) + + return await self._run_with_timeout_retries( + attempt, + description = f"web_find({selector_type.name}, {selector_value})", + key = "default", + override = timeout + ) + + async def web_find_all(self, selector_type:By, selector_value:str, *, parent:Element | None = None, timeout:int | float | None = None) -> list[Element]: + """ + Locates multiple HTML elements by the given selector type and value. + + :param timeout: timeout in seconds (base value before multiplier/backoff) + :raises TimeoutError: if element could not be found within time + """ + + async def attempt(effective_timeout:float) -> list[Element]: + return await self._web_find_all_once(selector_type, selector_value, effective_timeout, parent = parent) + + return await self._run_with_timeout_retries( + attempt, + description = f"web_find_all({selector_type.name}, {selector_value})", + key = "default", + override = timeout + ) + + async def _web_find_once(self, selector_type:By, selector_value:str, timeout:float, *, parent:Element | None = None) -> Element: + timeout_suffix = f" within {timeout} seconds." 
+ match selector_type: case By.ID: escaped_id = selector_value.translate(METACHAR_ESCAPER) return await self.web_await( lambda: self.page.query_selector(f"#{escaped_id}", parent), timeout = timeout, - timeout_error_message = f"No HTML element found with ID '{selector_value}' within {timeout} seconds.") + timeout_error_message = f"No HTML element found with ID '{selector_value}'{timeout_suffix}", + apply_multiplier = False) case By.CLASS_NAME: escaped_classname = selector_value.translate(METACHAR_ESCAPER) return await self.web_await( lambda: self.page.query_selector(f".{escaped_classname}", parent), timeout = timeout, - timeout_error_message = f"No HTML element found with CSS class '{selector_value}' within {timeout} seconds.") + timeout_error_message = f"No HTML element found with CSS class '{selector_value}'{timeout_suffix}", + apply_multiplier = False) case By.TAG_NAME: return await self.web_await( lambda: self.page.query_selector(selector_value, parent), timeout = timeout, - timeout_error_message = f"No HTML element found of tag <{selector_value}> within {timeout} seconds.") + timeout_error_message = f"No HTML element found of tag <{selector_value}>{timeout_suffix}", + apply_multiplier = False) case By.CSS_SELECTOR: return await self.web_await( lambda: self.page.query_selector(selector_value, parent), timeout = timeout, - timeout_error_message = f"No HTML element found using CSS selector '{selector_value}' within {timeout} seconds.") + timeout_error_message = f"No HTML element found using CSS selector '{selector_value}'{timeout_suffix}", + apply_multiplier = False) case By.TEXT: ensure(not parent, f"Specifying a parent element currently not supported with selector type: {selector_type}") return await self.web_await( lambda: self.page.find_element_by_text(selector_value, best_match = True), timeout = timeout, - timeout_error_message = f"No HTML element found containing text '{selector_value}' within {timeout} seconds.") + timeout_error_message = f"No HTML element 
found containing text '{selector_value}'{timeout_suffix}", + apply_multiplier = False) case By.XPATH: ensure(not parent, f"Specifying a parent element currently not supported with selector type: {selector_type}") return await self.web_await( lambda: self.page.find_element_by_text(selector_value, best_match = True), timeout = timeout, - timeout_error_message = f"No HTML element found using XPath '{selector_value}' within {timeout} seconds.") + timeout_error_message = f"No HTML element found using XPath '{selector_value}'{timeout_suffix}", + apply_multiplier = False) raise AssertionError(_("Unsupported selector type: %s") % selector_type) - async def web_find_all(self, selector_type:By, selector_value:str, *, parent:Element | None = None, timeout:int | float = 5) -> list[Element]: - """ - Locates an HTML element by ID. + async def _web_find_all_once(self, selector_type:By, selector_value:str, timeout:float, *, parent:Element | None = None) -> list[Element]: + timeout_suffix = f" within {timeout} seconds." 
- :param timeout: timeout in seconds - :raises TimeoutError: if element could not be found within time - """ match selector_type: case By.CLASS_NAME: escaped_classname = selector_value.translate(METACHAR_ESCAPER) return await self.web_await( lambda: self.page.query_selector_all(f".{escaped_classname}", parent), timeout = timeout, - timeout_error_message = f"No HTML elements found with CSS class '{selector_value}' within {timeout} seconds.") + timeout_error_message = f"No HTML elements found with CSS class '{selector_value}'{timeout_suffix}", + apply_multiplier = False) case By.CSS_SELECTOR: return await self.web_await( lambda: self.page.query_selector_all(selector_value, parent), timeout = timeout, - timeout_error_message = f"No HTML elements found using CSS selector '{selector_value}' within {timeout} seconds.") + timeout_error_message = f"No HTML elements found using CSS selector '{selector_value}'{timeout_suffix}", + apply_multiplier = False) case By.TAG_NAME: return await self.web_await( lambda: self.page.query_selector_all(selector_value, parent), timeout = timeout, - timeout_error_message = f"No HTML elements found of tag <{selector_value}> within {timeout} seconds.") + timeout_error_message = f"No HTML elements found of tag <{selector_value}>{timeout_suffix}", + apply_multiplier = False) case By.TEXT: ensure(not parent, f"Specifying a parent element currently not supported with selector type: {selector_type}") return await self.web_await( lambda: self.page.find_elements_by_text(selector_value), timeout = timeout, - timeout_error_message = f"No HTML elements found containing text '{selector_value}' within {timeout} seconds.") + timeout_error_message = f"No HTML elements found containing text '{selector_value}'{timeout_suffix}", + apply_multiplier = False) case By.XPATH: ensure(not parent, f"Specifying a parent element currently not supported with selector type: {selector_type}") return await self.web_await( lambda: 
self.page.find_elements_by_text(selector_value), timeout = timeout, - timeout_error_message = f"No HTML elements found using XPath '{selector_value}' within {timeout} seconds.") + timeout_error_message = f"No HTML elements found using XPath '{selector_value}'{timeout_suffix}", + apply_multiplier = False) raise AssertionError(_("Unsupported selector type: %s") % selector_type) - async def web_input(self, selector_type:By, selector_value:str, text:str | int, *, timeout:int | float = 5) -> Element: + async def web_input(self, selector_type:By, selector_value:str, text:str | int, *, timeout:int | float | None = None) -> Element: """ Enters text into an HTML input field. @@ -749,10 +862,10 @@ class WebScrapingMixin: await self.web_sleep() return input_field - async def web_open(self, url:str, *, timeout:int | float = 15_000, reload_if_already_open:bool = False) -> None: + async def web_open(self, url:str, *, timeout:int | float | None = None, reload_if_already_open:bool = False) -> None: """ :param url: url to open in browser - :param timeout: timespan in seconds within the page needs to be loaded + :param timeout: timespan in seconds within the page needs to be loaded (base value) :param reload_if_already_open: if False does nothing if the URL is already open in the browser :raises TimeoutException: if page did not open within given timespan """ @@ -761,10 +874,15 @@ class WebScrapingMixin: LOG.debug(" => skipping, [%s] is already open", url) return self.page = await self.browser.get(url = url, new_tab = False, new_window = False) - await self.web_await(lambda: self.web_execute("document.readyState == 'complete'"), timeout = timeout, - timeout_error_message = f"Page did not finish loading within {timeout} seconds.") + page_timeout = self._effective_timeout("page_load", timeout) + await self.web_await( + lambda: self.web_execute("document.readyState == 'complete'"), + timeout = page_timeout, + timeout_error_message = f"Page did not finish loading within {page_timeout} 
seconds.", + apply_multiplier = False + ) - async def web_text(self, selector_type:By, selector_value:str, *, parent:Element | None = None, timeout:int | float = 5) -> str: + async def web_text(self, selector_type:By, selector_value:str, *, parent:Element | None = None, timeout:int | float | None = None) -> str: return str(await (await self.web_find(selector_type, selector_value, parent = parent, timeout = timeout)).apply(""" function (elem) { let sel = window.getSelection() @@ -835,7 +953,7 @@ class WebScrapingMixin: await self.web_execute(f"window.scrollTo(0, {current_y_pos})") await asyncio.sleep(scroll_length / scroll_speed / 2) # double speed - async def web_select(self, selector_type:By, selector_value:str, selected_value:Any, timeout:int | float = 5) -> Element: + async def web_select(self, selector_type:By, selector_value:str, selected_value:Any, timeout:int | float | None = None) -> Element: """ Selects an