fix: eliminate async safety violations and migrate to pathlib (#697)

## ℹ️ Description
Eliminate all blocking I/O operations in async contexts and modernize
file path handling by migrating from os.path to pathlib.Path.

- Link to the related issue(s): #692 
- Get rid of the TODO in pyproject.toml
- The added debug logging will ease the troubleshooting for path related
issues.

## 📋 Changes Summary

- Enable ASYNC210, ASYNC230, ASYNC240, ASYNC250 Ruff rules
- Wrap blocking urllib.request.urlopen() in run_in_executor
- Wrap blocking file operations (open, write) in run_in_executor
- Replace blocking os.path calls with async helpers using
run_in_executor
- Replace blocking input() with await ainput()
- Migrate extract.py from os.path to pathlib.Path
- Use Path() constructor and / operator for path joining
- Use Path.mkdir(), Path.rename() in executor instead of os functions
- Create mockable _path_exists() and _path_is_dir() helpers
- Add debug logging for all file system operations

### ⚙️ Type of Change
Select the type(s) of change(s) included in this pull request:
- [X] 🐞 Bug fix (non-breaking change which fixes an issue)
- [ ] ✨ New feature (adds new functionality without breaking existing
usage)
- [ ] 💥 Breaking change (changes that might break existing user setups,
scripts, or configurations)


## ✅ Checklist
Before requesting a review, confirm the following:
- [X] I have reviewed my changes to ensure they meet the project's
standards.
- [X] I have tested my changes and ensured that all tests pass (`pdm run
test`).
- [X] I have formatted the code (`pdm run format`).
- [X] I have verified that linting passes (`pdm run lint`).
- [X] I have updated documentation where necessary.

By submitting this pull request, I confirm that you can use, modify,
copy, and redistribute this contribution under the terms of your
choice.


<!-- This is an auto-generated comment: release notes by coderabbit.ai
-->
## Summary by CodeRabbit

* **Refactor**
  * Made user prompt non‑blocking to improve responsiveness.
  * Converted filesystem/path handling and prefs I/O to async‑friendly
    operations; moved blocking network and file work to background tasks.
  * Added async file/path helpers and async port‑check before browser
    connections.

* **Tests**
  * Expanded unit tests for path helpers, image download success/failure,
    prefs writing, and directory creation/renaming workflows.

<sub>✏️ Tip: You can customize this high-level summary in your review
settings.</sub>
<!-- end of auto-generated comment: release notes by coderabbit.ai -->
This commit is contained in:
Jens
2025-12-05 20:53:40 +01:00
committed by GitHub
parent 6cbc25b54c
commit 220c01f257
9 changed files with 527 additions and 303 deletions

View File

@@ -937,7 +937,7 @@ class KleinanzeigenBot(WebScrapingMixin):
LOG.warning("# Payment form detected! Please proceed with payment.")
LOG.warning("############################################")
await self.web_scroll_page_down()
input(_("Press a key to continue..."))
await ainput(_("Press a key to continue..."))
except TimeoutError:
pass
@@ -1108,7 +1108,7 @@ class KleinanzeigenBot(WebScrapingMixin):
# in some categories we need to go another dialog back
try:
await self.web_find(By.XPATH, '//dialog//button[contains(., "Andere Versandmethoden")]',
timeout=short_timeout)
timeout = short_timeout)
except TimeoutError:
await self.web_click(By.XPATH, '//dialog//button[contains(., "Zurück")]')

View File

@@ -1,18 +1,21 @@
# SPDX-FileCopyrightText: © Sebastian Thomschke and contributors
# SPDX-License-Identifier: AGPL-3.0-or-later
# SPDX-ArtifactOfProjectHomePage: https://github.com/Second-Hand-Friends/kleinanzeigen-bot/
import asyncio
from gettext import gettext as _
import json, mimetypes, os, re, shutil # isort: skip
import json, mimetypes, re, shutil # isort: skip
import urllib.error as urllib_error
import urllib.request as urllib_request
from datetime import datetime
from pathlib import Path
from typing import Any, Final
from kleinanzeigen_bot.model.ad_model import ContactPartial
from .model.ad_model import AdPartial
from .model.config_model import Config
from .utils import dicts, i18n, loggers, misc, reflect
from .utils import dicts, files, i18n, loggers, misc, reflect
from .utils.web_scraping_mixin import Browser, By, Element, WebScrapingMixin
__all__ = [
@@ -44,23 +47,39 @@ class AdExtractor(WebScrapingMixin):
"""
# create sub-directory for ad(s) to download (if necessary):
relative_directory = "downloaded-ads"
# make sure configured base directory exists
if not os.path.exists(relative_directory) or not os.path.isdir(relative_directory):
os.mkdir(relative_directory)
LOG.info("Created ads directory at ./%s.", relative_directory)
relative_directory = Path("downloaded-ads")
# make sure configured base directory exists (using exist_ok=True to avoid TOCTOU race)
await asyncio.get_running_loop().run_in_executor(None, lambda: relative_directory.mkdir(exist_ok = True)) # noqa: ASYNC240
LOG.info("Ensured ads directory exists at ./%s.", relative_directory)
# Extract ad info and determine final directory path
ad_cfg, final_dir = await self._extract_ad_page_info_with_directory_handling(
relative_directory, ad_id
)
# Save the ad configuration file
ad_file_path = final_dir + "/" + f"ad_{ad_id}.yaml"
dicts.save_dict(
ad_file_path,
ad_cfg.model_dump(),
header = "# yaml-language-server: $schema=https://raw.githubusercontent.com/Second-Hand-Friends/kleinanzeigen-bot/refs/heads/main/schemas/ad.schema.json")
# Save the ad configuration file (offload to executor to avoid blocking the event loop)
ad_file_path = str(Path(final_dir) / f"ad_{ad_id}.yaml")
header_string = "# yaml-language-server: $schema=https://raw.githubusercontent.com/Second-Hand-Friends/kleinanzeigen-bot/refs/heads/main/schemas/ad.schema.json"
await asyncio.get_running_loop().run_in_executor(
None,
lambda: dicts.save_dict(ad_file_path, ad_cfg.model_dump(), header = header_string)
)
@staticmethod
def _download_and_save_image_sync(url:str, directory:str, filename_prefix:str, img_nr:int) -> str | None:
try:
with urllib_request.urlopen(url) as response: # noqa: S310 Audit URL open for permitted schemes.
content_type = response.info().get_content_type()
file_ending = mimetypes.guess_extension(content_type) or ""
# Use pathlib.Path for OS-agnostic path handling
img_path = Path(directory) / f"{filename_prefix}{img_nr}{file_ending}"
with open(img_path, "wb") as f:
shutil.copyfileobj(response, f)
return str(img_path)
except (urllib_error.URLError, urllib_error.HTTPError, OSError, shutil.Error) as e:
# Narrow exception handling to expected network/filesystem errors
LOG.warning("Failed to download image %s: %s", url, e)
return None
async def _download_images_from_ad_page(self, directory:str, ad_id:int) -> list[str]:
"""
@@ -85,19 +104,26 @@ class AdExtractor(WebScrapingMixin):
img_nr = 1
dl_counter = 0
loop = asyncio.get_running_loop()
for img_element in images:
current_img_url = img_element.attrs["src"] # URL of the image
if current_img_url is None:
continue
with urllib_request.urlopen(str(current_img_url)) as response: # noqa: S310 Audit URL open for permitted schemes.
content_type = response.info().get_content_type()
file_ending = mimetypes.guess_extension(content_type)
img_path = f"{directory}/{img_fn_prefix}{img_nr}{file_ending}"
with open(img_path, "wb") as f:
shutil.copyfileobj(response, f)
img_path = await loop.run_in_executor(
None,
self._download_and_save_image_sync,
str(current_img_url),
directory,
img_fn_prefix,
img_nr
)
if img_path:
dl_counter += 1
img_paths.append(img_path.rsplit("/", maxsplit = 1)[-1])
# Use pathlib.Path for OS-agnostic path handling
img_paths.append(Path(img_path).name)
img_nr += 1
LOG.info("Downloaded %s.", i18n.pluralize("image", dl_counter))
@@ -354,8 +380,8 @@ class AdExtractor(WebScrapingMixin):
return ad_cfg
async def _extract_ad_page_info_with_directory_handling(
self, relative_directory:str, ad_id:int
) -> tuple[AdPartial, str]:
self, relative_directory:Path, ad_id:int
) -> tuple[AdPartial, Path]:
"""
Extracts ad information and handles directory creation/renaming.
@@ -373,32 +399,37 @@ class AdExtractor(WebScrapingMixin):
# Determine the final directory path
sanitized_title = misc.sanitize_folder_name(title, self.config.download.folder_name_max_length)
final_dir = os.path.join(relative_directory, f"ad_{ad_id}_{sanitized_title}")
temp_dir = os.path.join(relative_directory, f"ad_{ad_id}")
final_dir = relative_directory / f"ad_{ad_id}_{sanitized_title}"
temp_dir = relative_directory / f"ad_{ad_id}"
loop = asyncio.get_running_loop()
# Handle existing directories
if os.path.exists(final_dir):
if await files.exists(final_dir):
# If the folder with title already exists, delete it
LOG.info("Deleting current folder of ad %s...", ad_id)
shutil.rmtree(final_dir)
LOG.debug("Removing directory tree: %s", final_dir)
await loop.run_in_executor(None, shutil.rmtree, str(final_dir))
if os.path.exists(temp_dir):
if await files.exists(temp_dir):
if self.config.download.rename_existing_folders:
# Rename the old folder to the new name with title
LOG.info("Renaming folder from %s to %s for ad %s...",
os.path.basename(temp_dir), os.path.basename(final_dir), ad_id)
os.rename(temp_dir, final_dir)
temp_dir.name, final_dir.name, ad_id)
LOG.debug("Renaming: %s -> %s", temp_dir, final_dir)
await loop.run_in_executor(None, temp_dir.rename, final_dir)
else:
# Use the existing folder without renaming
final_dir = temp_dir
LOG.info("Using existing folder for ad %s at %s.", ad_id, final_dir)
else:
# Create new directory with title
os.mkdir(final_dir)
LOG.debug("Creating new directory: %s", final_dir)
await loop.run_in_executor(None, final_dir.mkdir)
LOG.info("New directory for ad created at %s.", final_dir)
# Now extract complete ad info (including images) to the final directory
ad_cfg = await self._extract_ad_page_info(final_dir, ad_id)
ad_cfg = await self._extract_ad_page_info(str(final_dir), ad_id)
return ad_cfg, final_dir

View File

@@ -173,7 +173,10 @@ kleinanzeigen_bot/__init__.py:
kleinanzeigen_bot/extract.py:
#################################################
download_ad:
"Created ads directory at ./%s.": "Verzeichnis für Anzeigen erstellt unter ./%s."
"Ensured ads directory exists at ./%s.": "Verzeichnis [%s] für Anzeige vorhanden."
_download_and_save_image_sync:
"Failed to download image %s: %s": "Fehler beim Herunterladen des Bildes %s: %s"
_download_images_from_ad_page:
"Found %s.": "%s gefunden."

View File

@@ -1,7 +1,8 @@
# SPDX-FileCopyrightText: © Sebastian Thomschke and contributors
# SPDX-License-Identifier: AGPL-3.0-or-later
# SPDX-ArtifactOfProjectHomePage: https://github.com/Second-Hand-Friends/kleinanzeigen-bot/
import os
import asyncio, os # isort: skip
from pathlib import Path
def abspath(relative_path:str, relative_to:str | None = None) -> str:
@@ -24,3 +25,23 @@ def abspath(relative_path:str, relative_to:str | None = None) -> str:
base = os.path.dirname(base)
return os.path.normpath(os.path.join(base, relative_path))
async def exists(path:str | Path) -> bool:
"""
Asynchronously check if a file or directory exists.
:param path: Path to check
:return: True if path exists, False otherwise
"""
return await asyncio.get_running_loop().run_in_executor(None, Path(path).exists)
async def is_dir(path:str | Path) -> bool:
"""
Asynchronously check if a path is a directory.
:param path: Path to check
:return: True if path is a directory, False otherwise
"""
return await asyncio.get_running_loop().run_in_executor(None, Path(path).is_dir)

View File

@@ -22,7 +22,7 @@ from nodriver.core.tab import Tab as Page
from kleinanzeigen_bot.model.config_model import Config as BotConfig
from kleinanzeigen_bot.model.config_model import TimeoutConfig
from . import loggers, net
from . import files, loggers, net
from .chrome_version_detector import (
ChromeVersionInfo,
detect_chrome_version_from_binary,
@@ -100,6 +100,37 @@ class BrowserConfig:
self.profile_name:str | None = None
def _write_initial_prefs(prefs_file:str) -> None:
with open(prefs_file, "w", encoding = "UTF-8") as fd:
json.dump({
"credentials_enable_service": False,
"enable_do_not_track": True,
"google": {
"services": {
"consented_to_sync": False
}
},
"profile": {
"default_content_setting_values": {
"popups": 0,
"notifications": 2 # 1 = allow, 2 = block browser notifications
},
"password_manager_enabled": False
},
"signin": {
"allowed": False
},
"translate_site_blacklist": [
"www.kleinanzeigen.de"
],
"devtools": {
"preferences": {
"currentDockState": '"bottom"'
}
}
}, fd)
class WebScrapingMixin:
def __init__(self) -> None:
@@ -174,7 +205,7 @@ class WebScrapingMixin:
LOG.info("Creating Browser session...")
if self.browser_config.binary_location:
ensure(os.path.exists(self.browser_config.binary_location), f"Specified browser binary [{self.browser_config.binary_location}] does not exist.")
ensure(await files.exists(self.browser_config.binary_location), f"Specified browser binary [{self.browser_config.binary_location}] does not exist.")
else:
self.browser_config.binary_location = self.get_compatible_browser()
LOG.info(" -> Browser binary location: %s", self.browser_config.binary_location)
@@ -289,41 +320,14 @@ class WebScrapingMixin:
profile_dir = os.path.join(cfg.user_data_dir, self.browser_config.profile_name or "Default")
os.makedirs(profile_dir, exist_ok = True)
prefs_file = os.path.join(profile_dir, "Preferences")
if not os.path.exists(prefs_file):
if not await files.exists(prefs_file):
LOG.info(" -> Setting chrome prefs [%s]...", prefs_file)
with open(prefs_file, "w", encoding = "UTF-8") as fd:
json.dump({
"credentials_enable_service": False,
"enable_do_not_track": True,
"google": {
"services": {
"consented_to_sync": False
}
},
"profile": {
"default_content_setting_values": {
"popups": 0,
"notifications": 2 # 1 = allow, 2 = block browser notifications
},
"password_manager_enabled": False
},
"signin": {
"allowed": False
},
"translate_site_blacklist": [
"www.kleinanzeigen.de"
],
"devtools": {
"preferences": {
"currentDockState": '"bottom"'
}
}
}, fd)
await asyncio.get_running_loop().run_in_executor(None, _write_initial_prefs, prefs_file)
# load extensions
for crx_extension in self.browser_config.extensions:
LOG.info(" -> Adding Browser extension: [%s]", crx_extension)
ensure(os.path.exists(crx_extension), f"Configured extension-file [{crx_extension}] does not exist.")
ensure(await files.exists(crx_extension), f"Configured extension-file [{crx_extension}] does not exist.")
cfg.add_extension(crx_extension)
try: