mirror of
https://github.com/Second-Hand-Friends/kleinanzeigen-bot.git
synced 2026-03-12 02:31:45 +01:00
feat: collect timeout timing sessions for diagnostics (#814)
This commit is contained in:
@@ -272,7 +272,7 @@ Path resolution rules:
|
|||||||
|
|
||||||
- Runtime files are mode-dependent write locations (for example, logfile, update state, browser profile/cache, diagnostics, and downloaded ads).
|
- Runtime files are mode-dependent write locations (for example, logfile, update state, browser profile/cache, diagnostics, and downloaded ads).
|
||||||
- `--config` selects only the config file; it does not silently switch workspace mode.
|
- `--config` selects only the config file; it does not silently switch workspace mode.
|
||||||
- `--workspace-mode=portable`: runtime files are rooted next to the active config file (or the current working directory if no `--config` is supplied).
|
- `--workspace-mode=portable`: runtime files are placed in the same directory as the active config file (or the current working directory if no `--config` is supplied).
|
||||||
- `--workspace-mode=xdg`: runtime files use OS-standard user directories.
|
- `--workspace-mode=xdg`: runtime files use OS-standard user directories.
|
||||||
- `--config` without `--workspace-mode`: mode is inferred from existing footprints; on ambiguity/unknown, the command fails with guidance (for example: `Could not infer workspace mode for --config ...`) and asks you to rerun with `--workspace-mode=portable` or `--workspace-mode=xdg`.
|
- `--config` without `--workspace-mode`: mode is inferred from existing footprints; on ambiguity/unknown, the command fails with guidance (for example: `Could not infer workspace mode for --config ...`) and asks you to rerun with `--workspace-mode=portable` or `--workspace-mode=xdg`.
|
||||||
|
|
||||||
@@ -280,7 +280,7 @@ Examples:
|
|||||||
|
|
||||||
- `kleinanzeigen-bot --config /sync/dropbox/config1.yaml verify` (no `--workspace-mode`): mode is inferred from detected footprints; if both portable and user-directories footprints are found (or none are found), the command fails and lists the found paths.
|
- `kleinanzeigen-bot --config /sync/dropbox/config1.yaml verify` (no `--workspace-mode`): mode is inferred from detected footprints; if both portable and user-directories footprints are found (or none are found), the command fails and lists the found paths.
|
||||||
- `kleinanzeigen-bot --workspace-mode=portable --config /sync/dropbox/config1.yaml verify`: runtime files are rooted at `/sync/dropbox/` (for example `/sync/dropbox/.temp/` and `/sync/dropbox/downloaded-ads/`).
|
- `kleinanzeigen-bot --workspace-mode=portable --config /sync/dropbox/config1.yaml verify`: runtime files are rooted at `/sync/dropbox/` (for example `/sync/dropbox/.temp/` and `/sync/dropbox/downloaded-ads/`).
|
||||||
- `kleinanzeigen-bot --workspace-mode=xdg --config /sync/dropbox/config1.yaml verify`: config is read from `/sync/dropbox/config1.yaml`, while runtime files stay in user directories (for example Linux `~/.config/kleinanzeigen-bot/`, `~/.local/state/kleinanzeigen-bot/`, `~/.cache/kleinanzeigen-bot/`).
|
- `kleinanzeigen-bot --workspace-mode=xdg --config /sync/dropbox/config1.yaml verify`: config is read from `/sync/dropbox/config1.yaml`, while runtime files stay in user directories (on Linux: `~/.config/kleinanzeigen-bot/`, `~/.local/state/kleinanzeigen-bot/`, `~/.cache/kleinanzeigen-bot/`).
|
||||||
|
|
||||||
1. **Portable mode (recommended for most users, especially on Windows):**
|
1. **Portable mode (recommended for most users, especially on Windows):**
|
||||||
|
|
||||||
@@ -296,11 +296,11 @@ Examples:
|
|||||||
|
|
||||||
**OS notes (brief):**
|
**OS notes (brief):**
|
||||||
|
|
||||||
- **Windows:** User directories mode uses AppData (Roaming/Local); portable keeps everything beside the `.exe`.
|
- **Windows:** User directories mode uses AppData (Roaming/Local); portable keeps everything alongside the `.exe`.
|
||||||
- **Linux:** User directories mode uses `~/.config/kleinanzeigen-bot/config.yaml`, `~/.local/state/kleinanzeigen-bot/`, and `~/.cache/kleinanzeigen-bot/`; portable uses `./config.yaml`, `./.temp/`, and `./downloaded-ads/`.
|
- **Linux:** User directories mode uses `~/.config/kleinanzeigen-bot/config.yaml`, `~/.local/state/kleinanzeigen-bot/`, and `~/.cache/kleinanzeigen-bot/`; portable uses `./config.yaml`, `./.temp/`, and `./downloaded-ads/`.
|
||||||
- **macOS:** User directories mode uses `~/Library/Application Support/kleinanzeigen-bot/config.yaml` (config), `~/Library/Application Support/kleinanzeigen-bot/` (state/runtime), and `~/Library/Caches/kleinanzeigen-bot/` (cache/diagnostics); portable stays in the current working directory.
|
- **macOS:** User directories mode uses `~/Library/Application Support/kleinanzeigen-bot/config.yaml` (config), `~/Library/Application Support/kleinanzeigen-bot/` (state/runtime), and `~/Library/Caches/kleinanzeigen-bot/` (cache/diagnostics); portable stays in the current working directory.
|
||||||
|
|
||||||
If you have mixed legacy footprints (portable + XDG), pass an explicit mode (for example `--workspace-mode=portable`) and then clean up unused files. See [Configuration: Installation Modes](docs/CONFIGURATION.md#installation-modes).
|
If you have footprints from both modes (portable + XDG), pass an explicit mode (for example `--workspace-mode=portable`) and then clean up unused files. See [Configuration: Installation Modes](docs/CONFIGURATION.md#installation-modes).
|
||||||
|
|
||||||
### <a name="main-config"></a>1) Main configuration ⚙️
|
### <a name="main-config"></a>1) Main configuration ⚙️
|
||||||
|
|
||||||
|
|||||||
@@ -78,6 +78,11 @@ The bot will also provide specific instructions on how to fix your configuration
|
|||||||
1. Override specific keys under `timeouts` (e.g., `pagination_initial: 20.0`) if only a single selector is problematic.
|
1. Override specific keys under `timeouts` (e.g., `pagination_initial: 20.0`) if only a single selector is problematic.
|
||||||
1. For slow email verification prompts, raise `timeouts.email_verification`.
|
1. For slow email verification prompts, raise `timeouts.email_verification`.
|
||||||
1. Keep `retry_enabled` on so that DOM lookups are retried with exponential backoff.
|
1. Keep `retry_enabled` on so that DOM lookups are retried with exponential backoff.
|
||||||
|
1. Attach `timing_data.json` when opening issues so maintainers can tune defaults from real-world timing evidence.
|
||||||
|
- It is written automatically during runs when `diagnostics.timing_collection` is enabled (default: `true`, see `CONFIGURATION.md`).
|
||||||
|
- Portable mode path: `./.temp/timing/timing_data.json`
|
||||||
|
- User directories mode path: `~/.cache/kleinanzeigen-bot/timing/timing_data.json` (Linux), `~/Library/Caches/kleinanzeigen-bot/timing/timing_data.json` (macOS), or `%LOCALAPPDATA%\kleinanzeigen-bot\timing\timing_data.json` (Windows)
|
||||||
|
- Which one applies depends on your installation mode: portable mode writes next to your config/current directory, user directories mode writes in OS-standard user paths. Check which path exists on your system, or see `CONFIGURATION.md#installation-modes` for mode selection details.
|
||||||
|
|
||||||
### Issue: Bot fails to detect existing login session
|
### Issue: Bot fails to detect existing login session
|
||||||
|
|
||||||
|
|||||||
@@ -262,6 +262,7 @@ diagnostics:
|
|||||||
publish: false # Capture screenshot + HTML + JSON on each failed publish attempt (timeouts/protocol errors)
|
publish: false # Capture screenshot + HTML + JSON on each failed publish attempt (timeouts/protocol errors)
|
||||||
capture_log_copy: false # Copy entire bot log file when diagnostics are captured (may duplicate log content)
|
capture_log_copy: false # Copy entire bot log file when diagnostics are captured (may duplicate log content)
|
||||||
pause_on_login_detection_failure: false # Pause for manual inspection (interactive only)
|
pause_on_login_detection_failure: false # Pause for manual inspection (interactive only)
|
||||||
|
timing_collection: true # Collect timeout timing data locally for troubleshooting and tuning
|
||||||
output_dir: "" # Custom output directory (see "Output locations (default)" below)
|
output_dir: "" # Custom output directory (see "Output locations (default)" below)
|
||||||
```
|
```
|
||||||
|
|
||||||
@@ -309,7 +310,44 @@ The bot uses a layered approach to detect login state, prioritizing stealth over
|
|||||||
- **User directories mode**: `~/.cache/kleinanzeigen-bot/diagnostics/` (Linux), `~/Library/Caches/kleinanzeigen-bot/diagnostics/` (macOS), or `%LOCALAPPDATA%\kleinanzeigen-bot\Cache\diagnostics\` (Windows)
|
- **User directories mode**: `~/.cache/kleinanzeigen-bot/diagnostics/` (Linux), `~/Library/Caches/kleinanzeigen-bot/diagnostics/` (macOS), or `%LOCALAPPDATA%\kleinanzeigen-bot\Cache\diagnostics\` (Windows)
|
||||||
- **Custom**: Path resolved relative to your `config.yaml` if `output_dir` is specified
|
- **Custom**: Path resolved relative to your `config.yaml` if `output_dir` is specified
|
||||||
|
|
||||||
> **⚠️ PII Warning:** HTML dumps, JSON payloads, and log copies may contain PII. Typical examples include account email, ad titles/descriptions, contact info, and prices. Log copies are produced by `capture_log_copy` when diagnostics capture runs, such as `capture_on.publish` or `capture_on.login_detection`. Review or redact these artifacts before sharing them publicly.
|
**Timing collection output (default):**
|
||||||
|
|
||||||
|
- **Portable mode**: `./.temp/timing/timing_data.json`
|
||||||
|
- **User directories mode**: `~/.cache/kleinanzeigen-bot/timing/timing_data.json` (Linux), `~/Library/Caches/kleinanzeigen-bot/timing/timing_data.json` (macOS), or `%LOCALAPPDATA%\kleinanzeigen-bot\timing\timing_data.json` (Windows)
|
||||||
|
- Data is grouped by run/session and retained for 30 days via automatic cleanup during each data write
|
||||||
|
|
||||||
|
Example structure:
|
||||||
|
|
||||||
|
```json
|
||||||
|
[
|
||||||
|
{
|
||||||
|
"session_id": "abc12345",
|
||||||
|
"command": "publish",
|
||||||
|
"started_at": "2026-02-07T10:00:00+01:00",
|
||||||
|
"ended_at": "2026-02-07T10:04:30+01:00",
|
||||||
|
"records": [
|
||||||
|
{
|
||||||
|
"operation_key": "default",
|
||||||
|
"operation_type": "web_find",
|
||||||
|
"effective_timeout_sec": 5.0,
|
||||||
|
"actual_duration_sec": 1.2,
|
||||||
|
"attempt_index": 0,
|
||||||
|
"success": true
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
]
|
||||||
|
```
|
||||||
|
|
||||||
|
How to read it quickly:
|
||||||
|
|
||||||
|
- Group by `command` and `session_id` first to compare slow vs fast runs
|
||||||
|
- Look for high `actual_duration_sec` values near `effective_timeout_sec` and repeated `success: false` entries
|
||||||
|
- `attempt_index` is zero-based (`0` first attempt, `1` first retry)
|
||||||
|
- Use `operation_key` + `operation_type` to identify which timeout bucket (`default`, `page_load`, etc.) needs tuning
|
||||||
|
- For deeper timeout tuning workflow, see [Browser Troubleshooting](./BROWSER_TROUBLESHOOTING.md)
|
||||||
|
|
||||||
|
> **⚠️ PII Warning:** HTML dumps, JSON payloads, timing data JSON files (for example `timing_data.json`), and log copies may contain PII. Typical examples include account email, ad titles/descriptions, contact info, and prices. Log copies are produced by `capture_log_copy` when diagnostics capture runs, such as `capture_on.publish` or `capture_on.login_detection`. Review or redact these artifacts before sharing them publicly.
|
||||||
|
|
||||||
## Installation Modes
|
## Installation Modes
|
||||||
|
|
||||||
|
|||||||
@@ -24,6 +24,7 @@ from .utils.exceptions import CaptchaEncountered
|
|||||||
from .utils.files import abspath
|
from .utils.files import abspath
|
||||||
from .utils.i18n import Locale, get_current_locale, pluralize, set_current_locale
|
from .utils.i18n import Locale, get_current_locale, pluralize, set_current_locale
|
||||||
from .utils.misc import ainput, ensure, is_frozen
|
from .utils.misc import ainput, ensure, is_frozen
|
||||||
|
from .utils.timing_collector import TimingCollector
|
||||||
from .utils.web_scraping_mixin import By, Element, Is, WebScrapingMixin
|
from .utils.web_scraping_mixin import By, Element, Is, WebScrapingMixin
|
||||||
|
|
||||||
# W0406: possibly a bug, see https://github.com/PyCQA/pylint/issues/3933
|
# W0406: possibly a bug, see https://github.com/PyCQA/pylint/issues/3933
|
||||||
@@ -179,13 +180,14 @@ class KleinanzeigenBot(WebScrapingMixin): # noqa: PLR0904
|
|||||||
self._log_basename = os.path.splitext(os.path.basename(sys.executable))[0] if is_frozen() else self.__module__
|
self._log_basename = os.path.splitext(os.path.basename(sys.executable))[0] if is_frozen() else self.__module__
|
||||||
self.log_file_path:str | None = abspath(f"{self._log_basename}.log")
|
self.log_file_path:str | None = abspath(f"{self._log_basename}.log")
|
||||||
self._logfile_arg:str | None = None
|
self._logfile_arg:str | None = None
|
||||||
self._logfile_explicitly_provided = False
|
self._logfile_explicitly_provided:bool = False
|
||||||
|
|
||||||
self.command = "help"
|
self.command = "help"
|
||||||
self.ads_selector = "due"
|
self.ads_selector = "due"
|
||||||
self.keep_old_ads = False
|
self.keep_old_ads = False
|
||||||
|
|
||||||
self._login_detection_diagnostics_captured:bool = False
|
self._login_detection_diagnostics_captured:bool = False
|
||||||
|
self._timing_collector:TimingCollector | None = None
|
||||||
|
|
||||||
def __del__(self) -> None:
|
def __del__(self) -> None:
|
||||||
if self.file_log:
|
if self.file_log:
|
||||||
@@ -393,6 +395,12 @@ class KleinanzeigenBot(WebScrapingMixin): # noqa: PLR0904
|
|||||||
sys.exit(2)
|
sys.exit(2)
|
||||||
finally:
|
finally:
|
||||||
self.close_browser_session()
|
self.close_browser_session()
|
||||||
|
if self._timing_collector is not None:
|
||||||
|
try:
|
||||||
|
loop = asyncio.get_running_loop()
|
||||||
|
await loop.run_in_executor(None, self._timing_collector.flush)
|
||||||
|
except Exception as exc: # noqa: BLE001
|
||||||
|
LOG.warning("Timing collector flush failed: %s", exc)
|
||||||
|
|
||||||
def show_help(self) -> None:
|
def show_help(self) -> None:
|
||||||
if is_frozen():
|
if is_frozen():
|
||||||
@@ -613,6 +621,13 @@ class KleinanzeigenBot(WebScrapingMixin): # noqa: PLR0904
|
|||||||
config_yaml = dicts.load_dict_if_exists(self.config_file_path, _("config"))
|
config_yaml = dicts.load_dict_if_exists(self.config_file_path, _("config"))
|
||||||
self.config = Config.model_validate(config_yaml, strict = True, context = self.config_file_path)
|
self.config = Config.model_validate(config_yaml, strict = True, context = self.config_file_path)
|
||||||
|
|
||||||
|
timing_enabled = self.config.diagnostics.timing_collection
|
||||||
|
if timing_enabled and self.workspace:
|
||||||
|
timing_dir = self.workspace.diagnostics_dir.parent / "timing"
|
||||||
|
self._timing_collector = TimingCollector(timing_dir, self.command)
|
||||||
|
else:
|
||||||
|
self._timing_collector = None
|
||||||
|
|
||||||
# load built-in category mappings
|
# load built-in category mappings
|
||||||
self.categories = dicts.load_dict_from_module(resources, "categories.yaml", "")
|
self.categories = dicts.load_dict_from_module(resources, "categories.yaml", "")
|
||||||
LOG.debug("Loaded %s categories from categories.yaml", len(self.categories))
|
LOG.debug("Loaded %s categories from categories.yaml", len(self.categories))
|
||||||
|
|||||||
@@ -43,7 +43,7 @@ class AdExtractor(WebScrapingMixin):
|
|||||||
super().__init__()
|
super().__init__()
|
||||||
self.browser = browser
|
self.browser = browser
|
||||||
self.config:Config = config
|
self.config:Config = config
|
||||||
self.download_dir = download_dir
|
self.download_dir:Path = download_dir
|
||||||
self.published_ads_by_id:dict[int, dict[str, Any]] = published_ads_by_id or {}
|
self.published_ads_by_id:dict[int, dict[str, Any]] = published_ads_by_id or {}
|
||||||
|
|
||||||
async def download_ad(self, ad_id:int) -> None:
|
async def download_ad(self, ad_id:int) -> None:
|
||||||
|
|||||||
@@ -265,6 +265,10 @@ class DiagnosticsConfig(ContextualModel):
|
|||||||
default = None,
|
default = None,
|
||||||
description = "Optional output directory for diagnostics artifacts. If omitted, a safe default is used based on installation mode.",
|
description = "Optional output directory for diagnostics artifacts. If omitted, a safe default is used based on installation mode.",
|
||||||
)
|
)
|
||||||
|
timing_collection:bool = Field(
|
||||||
|
default = True,
|
||||||
|
description = "If true, collect local timeout timing data and write it to diagnostics JSON for troubleshooting and tuning.",
|
||||||
|
)
|
||||||
|
|
||||||
@model_validator(mode = "before")
|
@model_validator(mode = "before")
|
||||||
@classmethod
|
@classmethod
|
||||||
|
|||||||
@@ -261,6 +261,7 @@ kleinanzeigen_bot/__init__.py:
|
|||||||
"You provided no ads selector. Defaulting to \"new\".": "Es wurden keine Anzeigen-Selektor angegeben. Es wird \"new\" verwendet."
|
"You provided no ads selector. Defaulting to \"new\".": "Es wurden keine Anzeigen-Selektor angegeben. Es wird \"new\" verwendet."
|
||||||
"You provided no ads selector. Defaulting to \"changed\".": "Es wurden keine Anzeigen-Selektor angegeben. Es wird \"changed\" verwendet."
|
"You provided no ads selector. Defaulting to \"changed\".": "Es wurden keine Anzeigen-Selektor angegeben. Es wird \"changed\" verwendet."
|
||||||
"Unknown command: %s": "Unbekannter Befehl: %s"
|
"Unknown command: %s": "Unbekannter Befehl: %s"
|
||||||
|
"Timing collector flush failed: %s": "Zeitmessdaten konnten nicht gespeichert werden: %s"
|
||||||
|
|
||||||
fill_login_data_and_send:
|
fill_login_data_and_send:
|
||||||
"Logging in as [%s]...": "Anmeldung als [%s]..."
|
"Logging in as [%s]...": "Anmeldung als [%s]..."
|
||||||
@@ -527,6 +528,9 @@ kleinanzeigen_bot/utils/web_scraping_mixin.py:
|
|||||||
"Last page reached (no enabled 'Naechste' button found).": "Letzte Seite erreicht (kein aktivierter 'Naechste'-Button gefunden)."
|
"Last page reached (no enabled 'Naechste' button found).": "Letzte Seite erreicht (kein aktivierter 'Naechste'-Button gefunden)."
|
||||||
"No pagination controls found. Assuming last page.": "Keine Paginierungssteuerung gefunden. Es wird von der letzten Seite ausgegangen."
|
"No pagination controls found. Assuming last page.": "Keine Paginierungssteuerung gefunden. Es wird von der letzten Seite ausgegangen."
|
||||||
|
|
||||||
|
_record_timing:
|
||||||
|
"Timing collector failed for key=%s operation=%s: %s": "Zeitmessung fehlgeschlagen für key=%s operation=%s: %s"
|
||||||
|
|
||||||
close_browser_session:
|
close_browser_session:
|
||||||
"Closing Browser session...": "Schließe Browser-Sitzung..."
|
"Closing Browser session...": "Schließe Browser-Sitzung..."
|
||||||
|
|
||||||
@@ -685,6 +689,15 @@ kleinanzeigen_bot/utils/diagnostics.py:
|
|||||||
"Diagnostics capture attempted but no artifacts were saved (all captures failed)": "Diagnoseerfassung versucht, aber keine Artefakte gespeichert (alle Erfassungen fehlgeschlagen)"
|
"Diagnostics capture attempted but no artifacts were saved (all captures failed)": "Diagnoseerfassung versucht, aber keine Artefakte gespeichert (alle Erfassungen fehlgeschlagen)"
|
||||||
"Diagnostics capture failed: %s": "Diagnoseerfassung fehlgeschlagen: %s"
|
"Diagnostics capture failed: %s": "Diagnoseerfassung fehlgeschlagen: %s"
|
||||||
|
|
||||||
|
#################################################
|
||||||
|
kleinanzeigen_bot/utils/timing_collector.py:
|
||||||
|
#################################################
|
||||||
|
_load_existing_sessions:
|
||||||
|
"Unable to load timing collection data from %s: %s": "Zeitmessdaten aus %s konnten nicht geladen werden: %s"
|
||||||
|
|
||||||
|
flush:
|
||||||
|
"Failed to flush timing collection data: %s": "Zeitmessdaten konnten nicht gespeichert werden: %s"
|
||||||
|
|
||||||
#################################################
|
#################################################
|
||||||
kleinanzeigen_bot/utils/xdg_paths.py:
|
kleinanzeigen_bot/utils/xdg_paths.py:
|
||||||
#################################################
|
#################################################
|
||||||
|
|||||||
168
src/kleinanzeigen_bot/utils/timing_collector.py
Normal file
168
src/kleinanzeigen_bot/utils/timing_collector.py
Normal file
@@ -0,0 +1,168 @@
|
|||||||
|
# SPDX-FileCopyrightText: © Jens Bergmann and contributors
|
||||||
|
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||||
|
# SPDX-ArtifactOfProjectHomePage: https://github.com/Second-Hand-Friends/kleinanzeigen-bot/
|
||||||
|
|
||||||
|
"""Collect per-operation timeout timings and persist per-run JSON sessions.
|
||||||
|
|
||||||
|
`TimingCollector` records operation durations in seconds, grouped by a single bot run
|
||||||
|
(`session_id`). Call `record(...)` during runtime and `flush()` once at command end to
|
||||||
|
append the current session to `timing_data.json` with automatic 30-day retention.
|
||||||
|
The collector is best-effort and designed for troubleshooting, not strict telemetry.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import json, uuid # isort: skip
|
||||||
|
import os
|
||||||
|
from dataclasses import asdict, dataclass
|
||||||
|
from datetime import timedelta
|
||||||
|
from typing import TYPE_CHECKING, Any, Final
|
||||||
|
|
||||||
|
if TYPE_CHECKING:
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from kleinanzeigen_bot.utils import loggers, misc
|
||||||
|
|
||||||
|
LOG:Final[loggers.Logger] = loggers.get_logger(__name__)
|
||||||
|
|
||||||
|
RETENTION_DAYS:Final[int] = 30
|
||||||
|
TIMING_FILE:Final[str] = "timing_data.json"
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class TimingRecord:
    """One timed operation captured during a bot run.

    Field names deliberately match the JSON schema of entries inside the
    ``records`` array of ``timing_data.json``; do not rename them without
    migrating existing data files.
    """

    timestamp:str
    operation_key:str
    operation_type:str
    description:str
    configured_timeout_sec:float
    effective_timeout_sec:float
    actual_duration_sec:float
    attempt_index:int
    success:bool

    def to_dict(self) -> dict[str, Any]:
        """Serialize this record into a plain dict suitable for ``json.dump``."""
        return asdict(self)
||||||
|
|
||||||
|
|
||||||
|
class TimingCollector:
    """Collects per-operation timeout timings for one bot run and persists them as JSON.

    One collector instance represents a single run ("session"). Callers invoke
    :meth:`record` for each timed operation and :meth:`flush` once at command end;
    `flush` appends the session to `timing_data.json` inside `output_dir` and prunes
    sessions older than `RETENTION_DAYS`. Persistence is strictly best-effort:
    read/write failures are logged as warnings and never raised to the caller.
    """

    def __init__(self, output_dir:Path, command:str) -> None:
        """
        :param output_dir: directory where `timing_data.json` is written (resolved to an absolute path)
        :param command: the bot command being executed (e.g. "publish"), stored with the session
        """
        self.output_dir = output_dir.resolve()
        self.command = command
        # short random identifier grouping all records of this run
        self.session_id = uuid.uuid4().hex[:8]
        self.started_at = misc.now().isoformat()
        self.records:list[TimingRecord] = []
        # guards against persisting the same session twice if flush() is called repeatedly
        self._flushed = False

        LOG.debug("Timing collection initialized (session=%s, output_dir=%s, command=%s)", self.session_id, self.output_dir, command)

    def record(
        self,
        *,
        key:str,
        operation_type:str,
        description:str,
        configured_timeout:float,
        effective_timeout:float,
        actual_duration:float,
        attempt_index:int,
        success:bool,
    ) -> None:
        """Append one timing measurement to the in-memory session (keyword-only arguments).

        :param key: timeout bucket the operation used (e.g. "default", "page_load")
        :param operation_type: kind of operation performed (e.g. "web_find")
        :param description: human-readable description of the operation
        :param configured_timeout: timeout from configuration, in seconds
        :param effective_timeout: timeout actually applied to this attempt, in seconds
        :param actual_duration: measured duration of the attempt, in seconds
        :param attempt_index: zero-based attempt counter (0 = first attempt)
        :param success: whether the operation completed without a timeout
        """
        self.records.append(
            TimingRecord(
                timestamp = misc.now().isoformat(),
                operation_key = key,
                operation_type = operation_type,
                description = description,
                configured_timeout_sec = configured_timeout,
                effective_timeout_sec = effective_timeout,
                actual_duration_sec = actual_duration,
                attempt_index = attempt_index,
                success = success,
            )
        )
        LOG.debug(
            "Timing captured: %s [%s] duration=%.3fs timeout=%.3fs success=%s",
            operation_type,
            key,
            actual_duration,
            effective_timeout,
            success,
        )

    def flush(self) -> Path | None:
        """Persist the current session to `timing_data.json` and apply retention cleanup.

        :return: path of the written file, or None when nothing was written
                 (already flushed, no records captured, or the write failed)
        """
        if self._flushed:
            LOG.debug("Timing collection already flushed for this run")
            return None
        if not self.records:
            LOG.debug("Timing collection enabled but no records captured in this run")
            return None

        try:
            self.output_dir.mkdir(parents = True, exist_ok = True)
            data = self._load_existing_sessions()
            data.append(
                {
                    "session_id": self.session_id,
                    "command": self.command,
                    "started_at": self.started_at,
                    "ended_at": misc.now().isoformat(),
                    "records": [record.to_dict() for record in self.records],
                }
            )

            # retention: keep only sessions started within RETENTION_DAYS; sessions with
            # a missing or unparsable "started_at" are dropped as malformed
            cutoff = misc.now() - timedelta(days = RETENTION_DAYS)
            retained:list[dict[str, Any]] = []
            dropped = 0
            for session in data:
                try:
                    parsed = misc.parse_datetime(session.get("started_at"), add_timezone_if_missing = True)
                except ValueError:
                    parsed = None
                if parsed is None:
                    dropped += 1
                    continue
                if parsed >= cutoff:
                    retained.append(session)
                else:
                    dropped += 1

            if dropped > 0:
                LOG.debug("Timing collection pruned %d old or malformed sessions", dropped)

            # atomic write: dump to a session-unique temp file, fsync, then rename over the
            # target so concurrent readers never observe a partially written file
            output_file = self.output_dir / TIMING_FILE
            temp_file = self.output_dir / f".{TIMING_FILE}.{self.session_id}.tmp"
            with temp_file.open("w", encoding = "utf-8") as fd:
                json.dump(retained, fd, indent = 2)
                fd.write("\n")
                fd.flush()
                os.fsync(fd.fileno())
            temp_file.replace(output_file)

            LOG.debug(
                "Timing collection flushed to %s (%d sessions, %d current records, retention=%d days)",
                output_file,
                len(retained),
                len(self.records),
                RETENTION_DAYS,
            )
            self.records = []
            self._flushed = True
            return output_file
        except Exception as exc:  # noqa: BLE001
            # best-effort by design: persistence failures must never break the bot run
            LOG.warning("Failed to flush timing collection data: %s", exc)
            return None

    def _load_existing_sessions(self) -> list[dict[str, Any]]:
        """Read previously persisted sessions; returns [] when the file is missing or unreadable."""
        file_path = self.output_dir / TIMING_FILE
        if not file_path.exists():
            return []

        try:
            with file_path.open(encoding = "utf-8") as fd:
                payload = json.load(fd)
            if isinstance(payload, list):
                # keep only dict entries; silently skip malformed items
                return [item for item in payload if isinstance(item, dict)]
        except Exception as exc:  # noqa: BLE001
            LOG.warning("Unable to load timing collection data from %s: %s", file_path, exc)
        return []
||||||
@@ -183,6 +183,36 @@ class WebScrapingMixin:
|
|||||||
# Always perform the initial attempt plus the configured number of retries.
|
# Always perform the initial attempt plus the configured number of retries.
|
||||||
return 1 + cfg.retry_max_attempts
|
return 1 + cfg.retry_max_attempts
|
||||||
|
|
||||||
|
    def _record_timing(
        self,
        *,
        key:str,
        description:str,
        configured_timeout:float,
        effective_timeout:float,
        actual_duration:float,
        attempt_index:int,
        success:bool,
    ) -> None:
        """Forward one timing measurement to the optional timing collector (best-effort, never raises).

        :param key: timeout bucket used for the operation (e.g. "default")
        :param description: human-readable operation description, e.g. "web_find(ID, submit)"
        :param configured_timeout: configured timeout in seconds
        :param effective_timeout: timeout actually applied to this attempt, in seconds
        :param actual_duration: measured attempt duration in seconds
        :param attempt_index: zero-based attempt counter
        :param success: whether the attempt completed without timing out
        """
        # the collector only exists when timing collection is enabled; getattr also
        # guards against instances that never had the attribute assigned
        collector = getattr(self, "_timing_collector", None)
        if collector is None:
            return

        # derive the operation type from descriptions like "web_find(ID, submit)" -> "web_find";
        # NOTE(review): str.split returns the whole string when "(" is absent, so the
        # conditional is redundant but harmless
        operation_type = description.split("(", 1)[0] if "(" in description else description
        try:
            collector.record(
                key = key,
                operation_type = operation_type,
                description = description,
                configured_timeout = configured_timeout,
                effective_timeout = effective_timeout,
                actual_duration = actual_duration,
                attempt_index = attempt_index,
                success = success,
            )
        except Exception as exc:  # noqa: BLE001
            # timing collection must never break the actual web operation being timed
            LOG.warning("Timing collector failed for key=%s operation=%s: %s", key, operation_type, exc)
||||||
|
|
||||||
    async def _run_with_timeout_retries(
        self, operation:Callable[[float], Awaitable[T]], *, description:str, key:str = "default", override:float | None = None
    ) -> T:
        """
        Execute an async callable with retry/backoff handling for TimeoutError.

        Each attempt (initial try plus configured retries) is timed and reported via
        `_record_timing`; the final failed attempt re-raises the TimeoutError.

        :param operation: async callable receiving the effective timeout (seconds) for the attempt
        :param description: human-readable operation description used in logs and timing records
        :param key: timeout bucket to look up configured values for
        :param override: optional explicit timeout overriding the configured value
        """
        attempts = self._timeout_attempts()
        configured_timeout = self._timeout(key, override)
        # loop.time() is a monotonic clock, appropriate for measuring attempt durations
        loop = asyncio.get_running_loop()

        for attempt in range(attempts):
            # the effective timeout may differ per attempt (backoff handled by the helper)
            effective_timeout = self._effective_timeout(key, override, attempt = attempt)
            attempt_started = loop.time()
            try:
                result = await operation(effective_timeout)
                # record the successful attempt before returning the result
                self._record_timing(
                    key = key,
                    description = description,
                    configured_timeout = configured_timeout,
                    effective_timeout = effective_timeout,
                    actual_duration = loop.time() - attempt_started,
                    attempt_index = attempt,
                    success = True,
                )
                return result
            except TimeoutError:
                # record the failed attempt, then either re-raise (last attempt) or retry
                self._record_timing(
                    key = key,
                    description = description,
                    configured_timeout = configured_timeout,
                    effective_timeout = effective_timeout,
                    actual_duration = loop.time() - attempt_started,
                    attempt_index = attempt,
                    success = False,
                )
                if attempt >= attempts - 1:
                    raise
                LOG.debug("Retrying %s after TimeoutError (attempt %d/%d, timeout %.1fs)", description, attempt + 1, attempts, effective_timeout)
||||||
|
|||||||
@@ -96,12 +96,12 @@ def invoke_cli(
|
|||||||
set_current_locale(previous_locale)
|
set_current_locale(previous_locale)
|
||||||
|
|
||||||
|
|
||||||
def _xdg_env_overrides(tmp_path:Path) -> dict[str, str]:
|
def _xdg_env_overrides(base_path:Path) -> dict[str, str]:
|
||||||
"""Create temporary HOME/XDG environment overrides for isolated smoke test runs."""
|
"""Create temporary HOME/XDG environment overrides rooted at the provided base path."""
|
||||||
home = tmp_path / "home"
|
home = base_path / "home"
|
||||||
xdg_config = tmp_path / "xdg" / "config"
|
xdg_config = base_path / "xdg" / "config"
|
||||||
xdg_state = tmp_path / "xdg" / "state"
|
xdg_state = base_path / "xdg" / "state"
|
||||||
xdg_cache = tmp_path / "xdg" / "cache"
|
xdg_cache = base_path / "xdg" / "cache"
|
||||||
for path in (home, xdg_config, xdg_state, xdg_cache):
|
for path in (home, xdg_config, xdg_state, xdg_cache):
|
||||||
path.mkdir(parents = True, exist_ok = True)
|
path.mkdir(parents = True, exist_ok = True)
|
||||||
return {
|
return {
|
||||||
|
|||||||
204
tests/unit/test_timing_collector.py
Normal file
204
tests/unit/test_timing_collector.py
Normal file
@@ -0,0 +1,204 @@
|
|||||||
|
# SPDX-FileCopyrightText: © Jens Bergmann and contributors
|
||||||
|
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||||
|
# SPDX-ArtifactOfProjectHomePage: https://github.com/Second-Hand-Friends/kleinanzeigen-bot/
|
||||||
|
|
||||||
|
import json
|
||||||
|
from datetime import timedelta
|
||||||
|
from pathlib import Path
|
||||||
|
from unittest.mock import patch
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from kleinanzeigen_bot.utils import misc
|
||||||
|
from kleinanzeigen_bot.utils.timing_collector import RETENTION_DAYS, TimingCollector
|
||||||
|
|
||||||
|
pytestmark = pytest.mark.unit
|
||||||
|
|
||||||
|
|
||||||
|
class TestTimingCollector:
|
||||||
|
def test_output_dir_resolves_to_given_path(self, tmp_path:Path) -> None:
|
||||||
|
collector = TimingCollector(tmp_path / "xdg-cache" / "timing", "publish")
|
||||||
|
|
||||||
|
assert collector.output_dir == (tmp_path / "xdg-cache" / "timing").resolve()
|
||||||
|
|
||||||
|
def test_flush_writes_session_data(self, tmp_path:Path, monkeypatch:pytest.MonkeyPatch) -> None:
|
||||||
|
monkeypatch.chdir(tmp_path)
|
||||||
|
collector = TimingCollector(tmp_path / ".temp" / "timing", "publish")
|
||||||
|
collector.record(
|
||||||
|
key = "default",
|
||||||
|
operation_type = "web_find",
|
||||||
|
description = "web_find(ID, submit)",
|
||||||
|
configured_timeout = 5.0,
|
||||||
|
effective_timeout = 5.0,
|
||||||
|
actual_duration = 0.4,
|
||||||
|
attempt_index = 0,
|
||||||
|
success = True,
|
||||||
|
)
|
||||||
|
|
||||||
|
file_path = collector.flush()
|
||||||
|
|
||||||
|
assert file_path is not None
|
||||||
|
assert file_path.exists()
|
||||||
|
|
||||||
|
data = json.loads(file_path.read_text(encoding = "utf-8"))
|
||||||
|
assert isinstance(data, list)
|
||||||
|
assert len(data) == 1
|
||||||
|
assert data[0]["command"] == "publish"
|
||||||
|
assert len(data[0]["records"]) == 1
|
||||||
|
assert data[0]["records"][0]["operation_key"] == "default"
|
||||||
|
|
||||||
|
def test_flush_prunes_old_and_malformed_sessions(self, tmp_path:Path, monkeypatch:pytest.MonkeyPatch) -> None:
|
||||||
|
monkeypatch.chdir(tmp_path)
|
||||||
|
|
||||||
|
output_dir = tmp_path / ".temp" / "timing"
|
||||||
|
output_dir.mkdir(parents = True, exist_ok = True)
|
||||||
|
data_path = output_dir / "timing_data.json"
|
||||||
|
|
||||||
|
old_started = (misc.now() - timedelta(days = RETENTION_DAYS + 1)).isoformat()
|
||||||
|
recent_started = (misc.now() - timedelta(days = 2)).isoformat()
|
||||||
|
|
||||||
|
existing_payload = [
|
||||||
|
{
|
||||||
|
"session_id": "old-session",
|
||||||
|
"command": "publish",
|
||||||
|
"started_at": old_started,
|
||||||
|
"ended_at": old_started,
|
||||||
|
"records": [],
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"session_id": "recent-session",
|
||||||
|
"command": "publish",
|
||||||
|
"started_at": recent_started,
|
||||||
|
"ended_at": recent_started,
|
||||||
|
"records": [],
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"session_id": "malformed-session",
|
||||||
|
"command": "publish",
|
||||||
|
"started_at": "not-a-datetime",
|
||||||
|
"ended_at": "not-a-datetime",
|
||||||
|
"records": [],
|
||||||
|
},
|
||||||
|
]
|
||||||
|
data_path.write_text(json.dumps(existing_payload), encoding = "utf-8")
|
||||||
|
|
||||||
|
collector = TimingCollector(tmp_path / ".temp" / "timing", "verify")
|
||||||
|
collector.record(
|
||||||
|
key = "default",
|
||||||
|
operation_type = "web_find",
|
||||||
|
description = "web_find(ID, submit)",
|
||||||
|
configured_timeout = 5.0,
|
||||||
|
effective_timeout = 5.0,
|
||||||
|
actual_duration = 0.2,
|
||||||
|
attempt_index = 0,
|
||||||
|
success = True,
|
||||||
|
)
|
||||||
|
|
||||||
|
file_path = collector.flush()
|
||||||
|
|
||||||
|
assert file_path is not None
|
||||||
|
data = json.loads(file_path.read_text(encoding = "utf-8"))
|
||||||
|
session_ids = [session["session_id"] for session in data]
|
||||||
|
assert "old-session" not in session_ids
|
||||||
|
assert "malformed-session" not in session_ids
|
||||||
|
assert "recent-session" in session_ids
|
||||||
|
assert collector.session_id in session_ids
|
||||||
|
|
||||||
|
def test_flush_returns_none_when_already_flushed(self, tmp_path:Path, monkeypatch:pytest.MonkeyPatch) -> None:
|
||||||
|
monkeypatch.chdir(tmp_path)
|
||||||
|
collector = TimingCollector(tmp_path / ".temp" / "timing", "publish")
|
||||||
|
collector.record(
|
||||||
|
key = "default",
|
||||||
|
operation_type = "web_find",
|
||||||
|
description = "web_find(ID, submit)",
|
||||||
|
configured_timeout = 5.0,
|
||||||
|
effective_timeout = 5.0,
|
||||||
|
actual_duration = 0.1,
|
||||||
|
attempt_index = 0,
|
||||||
|
success = True,
|
||||||
|
)
|
||||||
|
|
||||||
|
first = collector.flush()
|
||||||
|
second = collector.flush()
|
||||||
|
|
||||||
|
assert first is not None
|
||||||
|
assert second is None
|
||||||
|
|
||||||
|
def test_flush_returns_none_when_no_records(self, tmp_path:Path, monkeypatch:pytest.MonkeyPatch) -> None:
|
||||||
|
monkeypatch.chdir(tmp_path)
|
||||||
|
collector = TimingCollector(tmp_path / ".temp" / "timing", "publish")
|
||||||
|
|
||||||
|
assert collector.flush() is None
|
||||||
|
|
||||||
|
def test_flush_recovers_from_corrupted_json(self, tmp_path:Path, monkeypatch:pytest.MonkeyPatch) -> None:
|
||||||
|
monkeypatch.chdir(tmp_path)
|
||||||
|
|
||||||
|
output_dir = tmp_path / ".temp" / "timing"
|
||||||
|
output_dir.mkdir(parents = True, exist_ok = True)
|
||||||
|
data_path = output_dir / "timing_data.json"
|
||||||
|
data_path.write_text("{ this is invalid json", encoding = "utf-8")
|
||||||
|
|
||||||
|
collector = TimingCollector(tmp_path / ".temp" / "timing", "verify")
|
||||||
|
collector.record(
|
||||||
|
key = "default",
|
||||||
|
operation_type = "web_find",
|
||||||
|
description = "web_find(ID, submit)",
|
||||||
|
configured_timeout = 5.0,
|
||||||
|
effective_timeout = 5.0,
|
||||||
|
actual_duration = 0.1,
|
||||||
|
attempt_index = 0,
|
||||||
|
success = True,
|
||||||
|
)
|
||||||
|
|
||||||
|
file_path = collector.flush()
|
||||||
|
|
||||||
|
assert file_path is not None
|
||||||
|
payload = json.loads(file_path.read_text(encoding = "utf-8"))
|
||||||
|
assert isinstance(payload, list)
|
||||||
|
assert len(payload) == 1
|
||||||
|
assert payload[0]["session_id"] == collector.session_id
|
||||||
|
|
||||||
|
def test_flush_ignores_non_list_payload(self, tmp_path:Path, monkeypatch:pytest.MonkeyPatch) -> None:
|
||||||
|
monkeypatch.chdir(tmp_path)
|
||||||
|
|
||||||
|
output_dir = tmp_path / ".temp" / "timing"
|
||||||
|
output_dir.mkdir(parents = True, exist_ok = True)
|
||||||
|
data_path = output_dir / "timing_data.json"
|
||||||
|
data_path.write_text(json.dumps({"unexpected": "shape"}), encoding = "utf-8")
|
||||||
|
|
||||||
|
collector = TimingCollector(tmp_path / ".temp" / "timing", "verify")
|
||||||
|
collector.record(
|
||||||
|
key = "default",
|
||||||
|
operation_type = "web_find",
|
||||||
|
description = "web_find(ID, submit)",
|
||||||
|
configured_timeout = 5.0,
|
||||||
|
effective_timeout = 5.0,
|
||||||
|
actual_duration = 0.1,
|
||||||
|
attempt_index = 0,
|
||||||
|
success = True,
|
||||||
|
)
|
||||||
|
|
||||||
|
file_path = collector.flush()
|
||||||
|
|
||||||
|
assert file_path is not None
|
||||||
|
payload = json.loads(file_path.read_text(encoding = "utf-8"))
|
||||||
|
assert isinstance(payload, list)
|
||||||
|
assert len(payload) == 1
|
||||||
|
assert payload[0]["session_id"] == collector.session_id
|
||||||
|
|
||||||
|
def test_flush_returns_none_when_write_raises(self, tmp_path:Path, monkeypatch:pytest.MonkeyPatch) -> None:
|
||||||
|
monkeypatch.chdir(tmp_path)
|
||||||
|
collector = TimingCollector(tmp_path / ".temp" / "timing", "verify")
|
||||||
|
collector.record(
|
||||||
|
key = "default",
|
||||||
|
operation_type = "web_find",
|
||||||
|
description = "web_find(ID, submit)",
|
||||||
|
configured_timeout = 5.0,
|
||||||
|
effective_timeout = 5.0,
|
||||||
|
actual_duration = 0.1,
|
||||||
|
attempt_index = 0,
|
||||||
|
success = True,
|
||||||
|
)
|
||||||
|
|
||||||
|
with patch.object(Path, "mkdir", side_effect = OSError("cannot create dir")):
|
||||||
|
assert collector.flush() is None
|
||||||
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user