mirror of
https://github.com/Second-Hand-Friends/kleinanzeigen-bot.git
synced 2026-03-12 18:41:50 +01:00
feat: add grouped selector timeout fallback for login detection (#843)
This commit is contained in:
@@ -2,7 +2,7 @@
|
||||
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
# SPDX-ArtifactOfProjectHomePage: https://github.com/Second-Hand-Friends/kleinanzeigen-bot/
|
||||
import asyncio, enum, inspect, json, os, platform, secrets, shutil, subprocess, urllib.request # isort: skip # noqa: S404
|
||||
from collections.abc import Awaitable, Callable, Coroutine, Iterable
|
||||
from collections.abc import Awaitable, Callable, Coroutine, Iterable, Sequence
|
||||
from gettext import gettext as _
|
||||
from pathlib import Path, PureWindowsPath
|
||||
from typing import Any, Final, Optional, cast
|
||||
@@ -39,6 +39,9 @@ if TYPE_CHECKING:
|
||||
|
||||
# Constants for RemoteObject conversion
|
||||
_KEY_VALUE_PAIR_SIZE = 2
|
||||
_PRIMARY_SELECTOR_BUDGET_RATIO:Final[float] = 0.70
|
||||
_BACKUP_SELECTOR_BUDGET_CAP_SECONDS:Final[float] = 0.75
|
||||
_BACKUP_SELECTOR_BUDGET_FLOOR_SECONDS:Final[float] = 0.25
|
||||
|
||||
|
||||
def _resolve_user_data_dir_paths(arg_value:str, config_value:str) -> tuple[Any, Any]:
|
||||
@@ -254,6 +257,153 @@ class WebScrapingMixin:
|
||||
|
||||
raise TimeoutError(f"{description} failed without executing operation")
|
||||
|
||||
@staticmethod
def _allocate_selector_group_budgets(total_timeout:float, selector_count:int) -> list[float]:
    """Allocate a shared timeout budget across selector alternatives.

    Strategy:
    - Give the first selector a preferred share via `_PRIMARY_SELECTOR_BUDGET_RATIO`.
    - Keep a minimum floor `_BACKUP_SELECTOR_BUDGET_FLOOR_SECONDS` per selector.
    - Cap backup slices with `_BACKUP_SELECTOR_BUDGET_CAP_SECONDS`.
    - Reassign final-backup surplus to the primary slot to preserve total timeout.

    :param total_timeout: overall budget in seconds shared by all alternatives
    :param selector_count: number of selector alternatives; must be > 0
    :return: one budget (seconds) per selector; the budgets sum to max(total_timeout, 0)
    :raises ValueError: if selector_count is not positive
    """
    if selector_count <= 0:
        raise ValueError(_("selector_count must be > 0"))
    if selector_count == 1:
        return [max(total_timeout, 0.0)]
    if total_timeout <= 0:
        # list multiplication instead of `[0.0 for _ in range(...)]`: the
        # comprehension variable `_` would shadow the gettext alias imported
        # at module level, which is confusing even though comprehension scope
        # keeps it from being an actual bug
        return [0.0] * selector_count

    # If total_timeout cannot satisfy the per-slot floor, split equally to preserve total budget.
    floor_total = _BACKUP_SELECTOR_BUDGET_FLOOR_SECONDS * selector_count
    if total_timeout < floor_total:
        equal_share = total_timeout / selector_count
        return [equal_share] * selector_count

    # Reserve minimum floor for backups before sizing the primary slice.
    reserve_for_backups = _BACKUP_SELECTOR_BUDGET_FLOOR_SECONDS * (selector_count - 1)
    # Primary gets the preferred ratio, but never steals the reserved backup floors.
    primary = min(total_timeout * _PRIMARY_SELECTOR_BUDGET_RATIO, total_timeout - reserve_for_backups)
    primary = max(primary, _BACKUP_SELECTOR_BUDGET_FLOOR_SECONDS)
    budgets = [primary]
    remaining = total_timeout - primary

    for index in range(selector_count - 1):
        is_last_backup = index == selector_count - 2
        if is_last_backup:
            # Last backup is capped; any surplus is folded back into primary to keep sum == total_timeout.
            alloc = min(remaining, _BACKUP_SELECTOR_BUDGET_CAP_SECONDS)
            budgets.append(alloc)
            surplus = remaining - alloc
            if surplus > 0:
                budgets[0] += surplus
            continue

        # Invariant: remaining >= floor * (slots not yet allocated), so alloc >= floor here.
        remaining_slots_after_this = selector_count - len(budgets) - 1
        # Keep floor reserve for remaining backups, then clamp this slice to floor/cap bounds.
        min_reserve = _BACKUP_SELECTOR_BUDGET_FLOOR_SECONDS * remaining_slots_after_this
        alloc = remaining - min_reserve
        alloc = max(_BACKUP_SELECTOR_BUDGET_FLOOR_SECONDS, alloc)
        alloc = min(_BACKUP_SELECTOR_BUDGET_CAP_SECONDS, alloc)
        budgets.append(alloc)
        remaining -= alloc

    return budgets
|
||||
|
||||
async def web_find_first_available(
    self,
    selectors:Sequence[tuple[By, str]],
    *,
    parent:Element | None = None,
    timeout:int | float | None = None,
    key:str = "default",
    description:str | None = None,
) -> tuple[Element, int]:
    """
    Find the first matching selector from an ordered group using a shared timeout budget.

    The overall timeout is split across the alternatives by
    `_allocate_selector_group_budgets` (primary-weighted, floor/cap bounded) and each
    candidate is tried in order until one resolves.

    :param selectors: ordered (selector_type, selector_value) alternatives; earlier entries are preferred
    :param parent: optional element to search under instead of the whole page
    :param timeout: overrides the retry budget resolved via `key` when given
    :param key: timeout-configuration key passed through to `_run_with_timeout_retries`
    :param description: human-readable label for logging/error messages
    :return: the matched element and the index of the selector alternative that found it
    :raises ValueError: if `selectors` is empty
    :raises TimeoutError: if no alternative matches within the shared budget
    """
    if not selectors:
        raise ValueError(_("selectors must contain at least one selector"))

    # One full pass over all alternatives; `_run_with_timeout_retries` may call this
    # more than once with a (re-)computed effective timeout.
    async def attempt(effective_timeout:float) -> tuple[Element, int]:
        budgets = self._allocate_selector_group_budgets(effective_timeout, len(selectors))
        failures:list[str] = []
        # strict=True: budgets must line up 1:1 with selectors — a mismatch is a programming error.
        for index, ((selector_type, selector_value), candidate_timeout) in enumerate(zip(selectors, budgets, strict = True)):
            try:
                element = await self._web_find_once(selector_type, selector_value, candidate_timeout, parent = parent)
                LOG.debug(
                    "Selector group matched candidate %d/%d (%s=%s) within %.2fs (group budget %.2fs)",
                    index + 1,
                    len(selectors),
                    selector_type.name,
                    selector_value,
                    candidate_timeout,
                    effective_timeout,
                )
                return element, index
            except TimeoutError as exc:
                # Remember the failure and fall through to the next alternative.
                failures.append(str(exc))
                LOG.debug(
                    "Selector group candidate %d/%d timed out (%s=%s) after %.2fs (group budget %.2fs)",
                    index + 1,
                    len(selectors),
                    selector_type.name,
                    selector_value,
                    candidate_timeout,
                    effective_timeout,
                )

        # All alternatives exhausted: surface the most recent underlying error for context.
        failure_summary = failures[-1] if failures else _("No selector candidates executed.")
        raise TimeoutError(
            _(
                "No HTML element found using selector group after trying %(count)d alternatives within %(timeout)s seconds."
                " Last error: %(error)s"
            )
            % {"count": len(selectors), "timeout": effective_timeout, "error": failure_summary}
        )

    attempt_description = description or f"web_find_first_available({len(selectors)} selectors)"
    return await self._run_with_timeout_retries(attempt, description = attempt_description, key = key, override = timeout)
|
||||
|
||||
async def web_text_first_available(
    self,
    selectors:Sequence[tuple[By, str]],
    *,
    parent:Element | None = None,
    timeout:int | float | None = None,
    key:str = "default",
    description:str | None = None,
) -> tuple[str, int]:
    """
    Resolve a selector group and return the visible text of the first match.

    Delegates element lookup to `web_find_first_available` (shared timeout budget,
    ordered alternatives) and text extraction to `_extract_visible_text`.
    Returns the text together with the index of the alternative that matched.
    """
    matched_element, winning_index = await self.web_find_first_available(
        selectors,
        parent = parent,
        timeout = timeout,
        key = key,
        description = description,
    )
    return await self._extract_visible_text(matched_element), winning_index
|
||||
|
||||
async def _extract_visible_text(self, element:Element) -> str:
    """Return visible text for a DOM element using user-selection extraction."""
    # Select the node's contents via the browser Selection API so only rendered
    # (user-visible) text is captured, then clear the selection afterwards.
    visible_text = await element.apply("""
        function (elem) {
            let sel = window.getSelection()
            sel.removeAllRanges()
            let range = document.createRange()
            range.selectNode(elem)
            sel.addRange(range)
            let visibleText = sel.toString().trim()
            sel.removeAllRanges()
            return visibleText
        }
    """)
    return str(visible_text)
|
||||
|
||||
async def create_browser_session(self) -> None:
|
||||
LOG.info("Creating Browser session...")
|
||||
|
||||
@@ -699,11 +849,13 @@ class WebScrapingMixin:
|
||||
return result
|
||||
except Exception as ex1:
|
||||
ex = ex1
|
||||
if loop.time() - start_at > effective_timeout:
|
||||
elapsed = loop.time() - start_at
|
||||
if elapsed >= effective_timeout:
|
||||
if ex:
|
||||
raise ex
|
||||
raise TimeoutError(timeout_error_message or f"Condition not met within {effective_timeout} seconds")
|
||||
await self.page.sleep(0.5)
|
||||
remaining_timeout = max(effective_timeout - elapsed, 0.0)
|
||||
await self.page.sleep(min(0.5, remaining_timeout))
|
||||
|
||||
async def web_check(self, selector_type:By, selector_value:str, attr:Is, *, timeout:int | float | None = None) -> bool:
|
||||
"""
|
||||
@@ -1013,20 +1165,8 @@ class WebScrapingMixin:
|
||||
)
|
||||
|
||||
async def web_text(self, selector_type:By, selector_value:str, *, parent:Element | None = None, timeout:int | float | None = None) -> str:
    """Return the visible text of the first element matching the given selector.

    :param parent: optional element to search under instead of the whole page
    :param timeout: optional override for the element-lookup timeout
    :raises TimeoutError: if no matching element appears within the timeout
    """
    # Diff residue removed: the pre-refactor inline-JS body was left in front of
    # the refactored code, making the delegation below unreachable. Only the
    # refactored implementation (shared `_extract_visible_text` helper) remains.
    element = await self.web_find(selector_type, selector_value, parent = parent, timeout = timeout)
    return await self._extract_visible_text(element)
|
||||
|
||||
async def web_sleep(self, min_ms:int = 1_000, max_ms:int = 2_500) -> None:
|
||||
duration = max_ms <= min_ms and min_ms or secrets.randbelow(max_ms - min_ms) + min_ms
|
||||
|
||||
Reference in New Issue
Block a user