mirror of
https://github.com/Second-Hand-Friends/kleinanzeigen-bot.git
synced 2026-03-12 10:31:50 +01:00
## ℹ️ Description - Related issues: #671, #658 - Introduces configurable timeout controls plus retry/backoff handling for flaky DOM operations. We often see timeouts which are not reproducible in certain configurations. I suspect the timeouts depend on a combination of internet speed, browser, OS, age of the computer and the weather. This PR introduces a comprehensive config model to tweak timeouts. ## 📋 Changes Summary - add TimeoutConfig to the main config/schema and expose timeouts in README/docs - wire WebScrapingMixin, extractor, update checker, and browser diagnostics to honor the configurable timeouts and retries - update translations/tests to cover the new behaviour and ensure lint/mypy/pyright pipelines remain green ### ⚙️ Type of Change - [ ] 🐞 Bug fix (non-breaking change which fixes an issue) - [x] ✨ New feature (adds new functionality without breaking existing usage) - [ ] 💥 Breaking change (changes that might break existing user setups, scripts, or configurations) ## ✅ Checklist - [x] I have reviewed my changes to ensure they meet the project's standards. - [x] I have tested my changes and ensured that all tests pass (`pdm run test`). - [x] I have formatted the code (`pdm run format`). - [x] I have verified that linting passes (`pdm run lint`). - [x] I have updated documentation where necessary. <!-- This is an auto-generated comment: release notes by coderabbit.ai --> ## Summary by CodeRabbit * **New Features** * Centralized, configurable timeout system for web interactions, detection flows, publishing, and pagination. * Optional retry with exponential backoff for operations that time out. * **Improvements** * Replaced fixed wait times with dynamic timeouts throughout workflows. * More informative timeout-related messages and diagnostics. * **Tests** * New and expanded test coverage for timeout behavior, pagination, diagnostics, and retry logic. <!-- end of auto-generated comment: release notes by coderabbit.ai -->
264 lines
9.2 KiB
Python
264 lines
9.2 KiB
Python
# SPDX-FileCopyrightText: © Jens Bergmann and contributors
|
|
# SPDX-License-Identifier: AGPL-3.0-or-later
|
|
# SPDX-ArtifactOfProjectHomePage: https://github.com/Second-Hand-Friends/kleinanzeigen-bot/
|
|
import json
|
|
import re
|
|
import subprocess # noqa: S404
|
|
import urllib.error
|
|
import urllib.request
|
|
from typing import Any, Final
|
|
|
|
from . import loggers
|
|
|
|
# Module-level logger, obtained from the package's `loggers` helper.
LOG:Final[loggers.Logger] = loggers.get_logger(__name__)
|
|
|
|
# Chrome 136 was released in March 2025 and introduced security changes
# (this module's validation treats --user-data-dir as required from that version on).
# Major versions greater than or equal to this threshold count as "136+".
|
|
CHROME_136_VERSION = 136
|
|
|
|
|
|
class ChromeVersionInfo:
    """
    Plain record describing one detected browser version.

    Attributes:
        version_string: Version text as supplied by the caller (e.g. "136.0.6778.0")
        major_version: Major version number (e.g. 136)
        browser_name: Display name of the browser; "Unknown" when not supplied
    """

    def __init__(self, version_string:str, major_version:int, browser_name:str = "Unknown") -> None:
        self.browser_name = browser_name
        self.major_version = major_version
        self.version_string = version_string

    def __str__(self) -> str:
        # Human-readable one-liner: "<name> <version> (major: <n>)"
        return f"{self.browser_name} {self.version_string} (major: {self.major_version})"

    @property
    def is_chrome_136_plus(self) -> bool:
        """Tell whether the major version reaches the CHROME_136_VERSION threshold."""
        return self.major_version >= CHROME_136_VERSION
|
|
|
|
|
|
def parse_version_string(version_string:str) -> int:
    """
    Extract the major version number from a Chrome-style version string.

    The first four-part dotted number found anywhere in the text is used, so
    all of these inputs are accepted:
        "136.0.6778.0"
        "136.0.6778.0 (Developer Build)"
        "136.0.6778.0 (Official Build) (x86_64)"
        "Google Chrome 136.0.6778.0"
        "Microsoft Edge 136.0.6778.0"
        "Chromium 136.0.6778.0"

    Args:
        version_string: Text containing a version such as "136.0.6778.0"

    Returns:
        The leading component of the version as an int (e.g., 136)

    Raises:
        ValueError: If no four-part dotted version is present in the text
    """
    found = re.search(r"(\d+)\.\d+\.\d+\.\d+", version_string)
    if found is None:
        raise ValueError(f"Could not parse version string: {version_string}")

    # The pattern has exactly one capture group: the major component.
    (major_text,) = found.groups()
    return int(major_text)
|
|
|
|
|
|
def _normalize_browser_name(browser_name:str) -> str:
|
|
"""
|
|
Normalize browser name for consistent detection.
|
|
|
|
Args:
|
|
browser_name: Raw browser name from detection
|
|
|
|
Returns:
|
|
Normalized browser name
|
|
"""
|
|
browser_name_lower = browser_name.lower()
|
|
if "edge" in browser_name_lower or "edg" in browser_name_lower:
|
|
return "Edge"
|
|
if "chromium" in browser_name_lower:
|
|
return "Chromium"
|
|
return "Chrome"
|
|
|
|
|
|
def detect_chrome_version_from_binary(binary_path:str, *, timeout:float | None = None) -> ChromeVersionInfo | None:
    """
    Detect Chrome version by running the browser binary.

    The binary is invoked with ``--version`` and its standard output is parsed
    for a four-part version number. The browser name is derived from the
    binary path, not from the command output.

    Args:
        binary_path: Path to the Chrome binary
        timeout: Optional timeout (seconds) for the subprocess call; 10.0 is used when None

    Returns:
        ChromeVersionInfo if successful, None if detection fails (binary missing or
        not executable, non-zero exit code, timeout, or unparsable output)
    """
    effective_timeout = timeout if timeout is not None else 10.0
    try:
        # Run browser with --version flag
        result = subprocess.run(  # noqa: S603
            [binary_path, "--version"],
            check = False,
            capture_output = True,
            text = True,
            timeout = effective_timeout
        )

        if result.returncode != 0:
            LOG.debug("Browser version command failed: %s", result.stderr)
            return None

        output = result.stdout.strip()
        major_version = parse_version_string(output)

        # Extract just the version number for version_string
        version_match = re.search(r"(\d+\.\d+\.\d+\.\d+)", output)
        version_string = version_match.group(1) if version_match else output

        # Determine browser name from binary path
        browser_name = _normalize_browser_name(binary_path)

        return ChromeVersionInfo(version_string, major_version, browser_name)

    except subprocess.TimeoutExpired:
        # Must precede the SubprocessError handler: TimeoutExpired is a subclass of it.
        LOG.debug("Browser version command timed out after %.1fs", effective_timeout)
        return None
    except (OSError, subprocess.SubprocessError, ValueError) as e:
        # OSError covers a missing or non-executable binary (FileNotFoundError,
        # PermissionError), which subprocess.run raises directly and which is
        # not a SubprocessError. ValueError covers unparsable version output.
        LOG.debug("Failed to detect browser version: %s", str(e))
        return None
|
|
|
|
|
|
def detect_chrome_version_from_remote_debugging(host:str = "127.0.0.1", port:int = 9222, *, timeout:float | None = None) -> ChromeVersionInfo | None:
    """
    Detect Chrome version from remote debugging API.

    Fetches ``http://<host>:<port>/json/version`` and reads the version from the
    ``Chrome/<version>`` token of the reported User-Agent; the browser name is
    taken from the ``Browser`` field of the same response.

    Args:
        host: Remote debugging host
        port: Remote debugging port
        timeout: Optional timeout (seconds) for the HTTP request; 5.0 is used when None

    Returns:
        ChromeVersionInfo if successful, None if detection fails
    """
    effective_timeout = timeout if timeout is not None else 5.0
    try:
        # Query the remote debugging API
        url = f"http://{host}:{port}/json/version"
        # Use a context manager so the HTTP response (and its socket) is always
        # closed, including when reading or decoding fails.
        with urllib.request.urlopen(url, timeout = effective_timeout) as response:  # noqa: S310
            payload = response.read().decode()
        version_data = json.loads(payload)

        # Extract version information
        user_agent = version_data.get("User-Agent", "")
        browser_name = _normalize_browser_name(version_data.get("Browser", "Unknown"))

        # Parse version from User-Agent string
        # Example: "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/136.0.6778.0 Safari/537.36"
        match = re.search(r"Chrome/(\d+)\.\d+\.\d+\.\d+", user_agent)
        if not match:
            LOG.debug("Could not parse Chrome version from User-Agent: %s", user_agent)
            return None

        major_version = int(match.group(1))
        version_string = match.group(0).replace("Chrome/", "")

        return ChromeVersionInfo(version_string, major_version, browser_name)

    except urllib.error.URLError as e:
        LOG.debug("Remote debugging API not accessible: %s", e)
        return None
    except json.JSONDecodeError as e:
        LOG.debug("Invalid JSON response from remote debugging API: %s", e)
        return None
    except Exception as e:
        # Deliberate best-effort boundary: detection must never crash the caller.
        LOG.debug("Failed to detect browser version from remote debugging: %s", str(e))
        return None
|
|
|
|
|
|
def validate_chrome_136_configuration(browser_arguments:list[str], user_data_dir:str | None) -> tuple[bool, str]:
|
|
"""
|
|
Validate configuration for Chrome/Edge 136+ security requirements.
|
|
|
|
Chrome/Edge 136+ requires --user-data-dir to be specified for security reasons.
|
|
|
|
Args:
|
|
browser_arguments: List of browser arguments
|
|
user_data_dir: User data directory configuration
|
|
|
|
Returns:
|
|
Tuple of (is_valid, error_message)
|
|
"""
|
|
# Check if user-data-dir is specified in arguments
|
|
has_user_data_dir_arg = any(
|
|
arg.startswith("--user-data-dir=")
|
|
for arg in browser_arguments
|
|
)
|
|
|
|
# Check if user_data_dir is configured
|
|
has_user_data_dir_config = user_data_dir is not None and user_data_dir.strip()
|
|
|
|
if not has_user_data_dir_arg and not has_user_data_dir_config:
|
|
return False, (
|
|
"Chrome/Edge 136+ requires --user-data-dir to be specified. "
|
|
"Add --user-data-dir=/path/to/directory to your browser arguments and "
|
|
'user_data_dir: "/path/to/directory" to your configuration.'
|
|
)
|
|
|
|
return True, ""
|
|
|
|
|
|
def get_chrome_version_diagnostic_info(
|
|
binary_path:str | None = None,
|
|
remote_host:str = "127.0.0.1",
|
|
remote_port:int | None = None,
|
|
*,
|
|
remote_timeout:float | None = None,
|
|
binary_timeout:float | None = None
|
|
) -> dict[str, Any]:
|
|
"""
|
|
Get comprehensive Chrome version diagnostic information.
|
|
|
|
Args:
|
|
binary_path: Path to Chrome binary (optional)
|
|
remote_host: Remote debugging host
|
|
remote_port: Remote debugging port (optional)
|
|
remote_timeout: Timeout for remote debugging detection
|
|
binary_timeout: Timeout for binary detection
|
|
|
|
Returns:
|
|
Dictionary with diagnostic information
|
|
"""
|
|
diagnostic_info:dict[str, Any] = {
|
|
"binary_detection": None,
|
|
"remote_detection": None,
|
|
"chrome_136_plus_detected": False,
|
|
"configuration_valid": True,
|
|
"recommendations": []
|
|
}
|
|
|
|
# Try binary detection
|
|
if binary_path:
|
|
version_info = detect_chrome_version_from_binary(binary_path, timeout = binary_timeout)
|
|
if version_info:
|
|
diagnostic_info["binary_detection"] = {
|
|
"version_string": version_info.version_string,
|
|
"major_version": version_info.major_version,
|
|
"browser_name": version_info.browser_name,
|
|
"is_chrome_136_plus": version_info.is_chrome_136_plus
|
|
}
|
|
diagnostic_info["chrome_136_plus_detected"] = version_info.is_chrome_136_plus
|
|
|
|
# Try remote debugging detection
|
|
if remote_port:
|
|
version_info = detect_chrome_version_from_remote_debugging(remote_host, remote_port, timeout = remote_timeout)
|
|
if version_info:
|
|
diagnostic_info["remote_detection"] = {
|
|
"version_string": version_info.version_string,
|
|
"major_version": version_info.major_version,
|
|
"browser_name": version_info.browser_name,
|
|
"is_chrome_136_plus": version_info.is_chrome_136_plus
|
|
}
|
|
diagnostic_info["chrome_136_plus_detected"] = version_info.is_chrome_136_plus
|
|
|
|
# Add recommendations based on detected version
|
|
if diagnostic_info["chrome_136_plus_detected"]:
|
|
diagnostic_info["recommendations"].append(
|
|
"Chrome 136+ detected - ensure --user-data-dir is configured for remote debugging"
|
|
)
|
|
|
|
return diagnostic_info
|