feat: add grouped selector timeout fallback for login detection (#843)

This commit is contained in:
Jens
2026-02-27 19:11:49 +01:00
committed by GitHub
parent fc456f4abd
commit 38e0f97578
5 changed files with 335 additions and 52 deletions

View File

@@ -1246,23 +1246,27 @@ class KleinanzeigenBot(WebScrapingMixin): # noqa: PLR0904
effective_timeout,
)
# Try to find the standard element first
try:
user_info = await self.web_text(By.CLASS_NAME, "mr-medium", timeout = login_check_timeout)
if username in user_info.lower():
LOG.debug("Login detected via .mr-medium element")
return True
except TimeoutError:
LOG.debug("Timeout waiting for .mr-medium element after %.1fs", effective_timeout)
login_selectors = [
(By.CLASS_NAME, "mr-medium"),
(By.ID, "user-email"),
]
primary_selector_index = 0
# If standard element not found or didn't contain username, try the alternative
try:
user_info = await self.web_text(By.ID, "user-email", timeout = login_check_timeout)
user_info, matched_selector = await self.web_text_first_available(
login_selectors,
timeout = login_check_timeout,
key = "login_detection",
description = "login_detection(selector_group)",
)
if username in user_info.lower():
LOG.debug("Login detected via #user-email element")
if matched_selector == primary_selector_index:
LOG.debug("Login detected via .mr-medium element")
else:
LOG.debug("Login detected via #user-email element")
return True
except TimeoutError:
LOG.debug("Timeout waiting for #user-email element after %.1fs", effective_timeout)
LOG.debug("Timeout waiting for login detection selector group after %.1fs", effective_timeout)
if not include_probe:
LOG.debug("No login detected - neither .mr-medium nor #user-email found with username")

View File

@@ -98,9 +98,8 @@ kleinanzeigen_bot/__init__.py:
is_logged_in:
"Starting login detection (timeout: %.1fs base, %.1fs effective with multiplier/backoff)": "Starte Login-Erkennung (Timeout: %.1fs Basis, %.1fs effektiv mit Multiplikator/Backoff)"
"Login detected via .mr-medium element": "Login erkannt über .mr-medium Element"
"Timeout waiting for .mr-medium element after %.1fs": "Timeout beim Warten auf .mr-medium Element nach %.1fs"
"Login detected via #user-email element": "Login erkannt über #user-email Element"
"Timeout waiting for #user-email element after %.1fs": "Timeout beim Warten auf #user-email Element nach %.1fs"
"Timeout waiting for login detection selector group after %.1fs": "Timeout beim Warten auf die Login-Erkennungs-Selektorgruppe nach %.1fs"
"No login detected - neither .mr-medium nor #user-email found with username": "Kein Login erkannt - weder .mr-medium noch #user-email mit Benutzername gefunden"
"No login detected - DOM elements not found and server probe returned %s": "Kein Login erkannt - DOM-Elemente nicht gefunden und Server-Probe ergab %s"
@@ -533,6 +532,17 @@ kleinanzeigen_bot/utils/web_scraping_mixin.py:
_record_timing:
"Timing collector failed for key=%s operation=%s: %s": "Zeitmessung fehlgeschlagen für key=%s operation=%s: %s"
_allocate_selector_group_budgets:
"selector_count must be > 0": "selector_count muss > 0 sein"
web_find_first_available:
"selectors must contain at least one selector": "selectors muss mindestens einen Selektor enthalten"
attempt:
"No selector candidates executed.": "Keine Selektor-Kandidaten ausgeführt."
? "No HTML element found using selector group after trying %(count)d alternatives within %(timeout)s seconds. Last error: %(error)s"
: "Kein HTML-Element über Selektorgruppe gefunden, nachdem %(count)d Alternativen innerhalb von %(timeout)s Sekunden versucht wurden. Letzter Fehler: %(error)s"
close_browser_session:
"Closing Browser session...": "Schließe Browser-Sitzung..."

View File

@@ -2,7 +2,7 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# SPDX-ArtifactOfProjectHomePage: https://github.com/Second-Hand-Friends/kleinanzeigen-bot/
import asyncio, enum, inspect, json, os, platform, secrets, shutil, subprocess, urllib.request # isort: skip # noqa: S404
from collections.abc import Awaitable, Callable, Coroutine, Iterable
from collections.abc import Awaitable, Callable, Coroutine, Iterable, Sequence
from gettext import gettext as _
from pathlib import Path, PureWindowsPath
from typing import Any, Final, Optional, cast
@@ -39,6 +39,9 @@ if TYPE_CHECKING:
# Constants for RemoteObject conversion
_KEY_VALUE_PAIR_SIZE = 2
_PRIMARY_SELECTOR_BUDGET_RATIO:Final[float] = 0.70
_BACKUP_SELECTOR_BUDGET_CAP_SECONDS:Final[float] = 0.75
_BACKUP_SELECTOR_BUDGET_FLOOR_SECONDS:Final[float] = 0.25
def _resolve_user_data_dir_paths(arg_value:str, config_value:str) -> tuple[Any, Any]:
@@ -254,6 +257,153 @@ class WebScrapingMixin:
raise TimeoutError(f"{description} failed without executing operation")
@staticmethod
def _allocate_selector_group_budgets(
    total_timeout:float,
    selector_count:int,
    *,
    primary_ratio:float | None = None,
    backup_cap:float | None = None,
    backup_floor:float | None = None,
) -> list[float]:
    """Allocate a shared timeout budget across selector alternatives.

    Strategy:
    - Give the first selector a preferred share (``primary_ratio``).
    - Keep a minimum floor (``backup_floor``) per selector.
    - Cap backup slices with ``backup_cap``.
    - Reassign final-backup surplus to the primary slot to preserve total timeout.

    :param total_timeout: total budget in seconds shared by the whole selector group
    :param selector_count: number of selector alternatives; must be > 0
    :param primary_ratio: preferred budget share for the first selector
        (defaults to `_PRIMARY_SELECTOR_BUDGET_RATIO`)
    :param backup_cap: upper bound in seconds per backup slice
        (defaults to `_BACKUP_SELECTOR_BUDGET_CAP_SECONDS`)
    :param backup_floor: lower bound in seconds per selector slice
        (defaults to `_BACKUP_SELECTOR_BUDGET_FLOOR_SECONDS`)
    :return: per-selector budgets whose sum equals max(total_timeout, 0)
    :raises ValueError: if selector_count is not positive
    """
    # Resolve tuning knobs lazily so default behavior tracks the module constants,
    # while callers (and tests) may override them per call.
    ratio = _PRIMARY_SELECTOR_BUDGET_RATIO if primary_ratio is None else primary_ratio
    cap = _BACKUP_SELECTOR_BUDGET_CAP_SECONDS if backup_cap is None else backup_cap
    floor = _BACKUP_SELECTOR_BUDGET_FLOOR_SECONDS if backup_floor is None else backup_floor
    if selector_count <= 0:
        raise ValueError(_("selector_count must be > 0"))
    if selector_count == 1:
        return [max(total_timeout, 0.0)]
    if total_timeout <= 0:
        return [0.0 for _ in range(selector_count)]
    # If total_timeout cannot satisfy per-slot floor, split equally to preserve total budget.
    floor_total = floor * selector_count
    if total_timeout < floor_total:
        equal_share = total_timeout / selector_count
        return [equal_share for _ in range(selector_count)]
    # Reserve minimum floor for backups before sizing the primary slice.
    reserve_for_backups = floor * (selector_count - 1)
    # Primary gets preferred ratio, but never steals the reserved backup floors.
    primary = min(total_timeout * ratio, total_timeout - reserve_for_backups)
    primary = max(primary, floor)
    budgets = [primary]
    remaining = total_timeout - primary
    for index in range(selector_count - 1):
        is_last_backup = index == selector_count - 2
        if is_last_backup:
            # Last backup is capped; any surplus is folded back into primary to keep sum == total_timeout.
            alloc = min(remaining, cap)
            budgets.append(alloc)
            surplus = remaining - alloc
            if surplus > 0:
                budgets[0] += surplus
            continue
        remaining_slots_after_this = selector_count - len(budgets) - 1
        # Keep floor reserve for remaining backups, then clamp this slice to floor/cap bounds.
        min_reserve = floor * remaining_slots_after_this
        alloc = remaining - min_reserve
        alloc = max(floor, alloc)
        alloc = min(cap, alloc)
        budgets.append(alloc)
        remaining -= alloc
    return budgets
async def web_find_first_available(
    self,
    selectors:Sequence[tuple[By, str]],
    *,
    parent:Element | None = None,
    timeout:int | float | None = None,
    key:str = "default",
    description:str | None = None,
) -> tuple[Element, int]:
    """
    Find the first matching selector from an ordered group using a shared timeout budget.
    """
    if not selectors:
        raise ValueError(_("selectors must contain at least one selector"))
    selector_count = len(selectors)

    async def attempt(effective_timeout:float) -> tuple[Element, int]:
        # Split the group budget across the alternatives (primary gets the preferred share).
        budgets = self._allocate_selector_group_budgets(effective_timeout, selector_count)
        candidate_errors:list[str] = []
        for position, (selector_type, selector_value) in enumerate(selectors):
            candidate_timeout = budgets[position]
            try:
                found = await self._web_find_once(selector_type, selector_value, candidate_timeout, parent = parent)
            except TimeoutError as exc:
                candidate_errors.append(str(exc))
                LOG.debug(
                    "Selector group candidate %d/%d timed out (%s=%s) after %.2fs (group budget %.2fs)",
                    position + 1,
                    selector_count,
                    selector_type.name,
                    selector_value,
                    candidate_timeout,
                    effective_timeout,
                )
                continue
            LOG.debug(
                "Selector group matched candidate %d/%d (%s=%s) within %.2fs (group budget %.2fs)",
                position + 1,
                selector_count,
                selector_type.name,
                selector_value,
                candidate_timeout,
                effective_timeout,
            )
            return found, position
        # No candidate matched within its slice: surface the most recent failure.
        failure_summary = candidate_errors[-1] if candidate_errors else _("No selector candidates executed.")
        raise TimeoutError(
            _(
                "No HTML element found using selector group after trying %(count)d alternatives within %(timeout)s seconds."
                " Last error: %(error)s"
            )
            % {"count": selector_count, "timeout": effective_timeout, "error": failure_summary}
        )

    attempt_description = description or f"web_find_first_available({selector_count} selectors)"
    return await self._run_with_timeout_retries(attempt, description = attempt_description, key = key, override = timeout)
async def web_text_first_available(
    self,
    selectors:Sequence[tuple[By, str]],
    *,
    parent:Element | None = None,
    timeout:int | float | None = None,
    key:str = "default",
    description:str | None = None,
) -> tuple[str, int]:
    """
    Return visible text from the first selector that resolves from a selector group.
    """
    # Resolve the element through the shared-budget group lookup, then extract its text.
    found, matched_index = await self.web_find_first_available(
        selectors,
        parent = parent,
        timeout = timeout,
        key = key,
        description = description,
    )
    return await self._extract_visible_text(found), matched_index
async def _extract_visible_text(self, element:Element) -> str:
    """Return visible text for a DOM element using user-selection extraction.

    Mirrors what a user would get by selecting the element and copying the
    selection (trimmed), rather than reading raw node text.
    """
    # Select the node in the page, read the selection text, then clear the selection again.
    extraction_script = """
        function (elem) {
            let sel = window.getSelection()
            sel.removeAllRanges()
            let range = document.createRange()
            range.selectNode(elem)
            sel.addRange(range)
            let visibleText = sel.toString().trim()
            sel.removeAllRanges()
            return visibleText
        }
    """
    return str(await element.apply(extraction_script))
async def create_browser_session(self) -> None:
LOG.info("Creating Browser session...")
@@ -699,11 +849,13 @@ class WebScrapingMixin:
return result
except Exception as ex1:
ex = ex1
if loop.time() - start_at > effective_timeout:
elapsed = loop.time() - start_at
if elapsed >= effective_timeout:
if ex:
raise ex
raise TimeoutError(timeout_error_message or f"Condition not met within {effective_timeout} seconds")
await self.page.sleep(0.5)
remaining_timeout = max(effective_timeout - elapsed, 0.0)
await self.page.sleep(min(0.5, remaining_timeout))
async def web_check(self, selector_type:By, selector_value:str, attr:Is, *, timeout:int | float | None = None) -> bool:
"""
@@ -1013,20 +1165,8 @@ class WebScrapingMixin:
)
async def web_text(self, selector_type:By, selector_value:str, *, parent:Element | None = None, timeout:int | float | None = None) -> str:
return str(
await (await self.web_find(selector_type, selector_value, parent = parent, timeout = timeout)).apply("""
function (elem) {
let sel = window.getSelection()
sel.removeAllRanges()
let range = document.createRange()
range.selectNode(elem)
sel.addRange(range)
let visibleText = sel.toString().trim()
sel.removeAllRanges()
return visibleText
}
""")
)
element = await self.web_find(selector_type, selector_value, parent = parent, timeout = timeout)
return await self._extract_visible_text(element)
async def web_sleep(self, min_ms:int = 1_000, max_ms:int = 2_500) -> None:
duration = max_ms <= min_ms and min_ms or secrets.randbelow(max_ms - min_ms) + min_ms