fix: eliminate async safety violations and migrate to pathlib (#697)

## ℹ️ Description
Eliminate all blocking I/O operations in async contexts and modernize
file path handling by migrating from os.path to pathlib.Path.

- Link to the related issue(s): #692 
- Get rid of the TODO in pyproject.toml
- The added debug logging will ease the troubleshooting for path related
issues.

## 📋 Changes Summary

- Enable ASYNC210, ASYNC230, ASYNC240, ASYNC250 Ruff rules
- Wrap blocking urllib.request.urlopen() in run_in_executor
- Wrap blocking file operations (open, write) in run_in_executor
- Replace blocking os.path calls with async helpers using
run_in_executor
- Replace blocking input() with await ainput()
- Migrate extract.py from os.path to pathlib.Path
- Use Path() constructor and / operator for path joining
- Use Path.mkdir(), Path.rename() in executor instead of os functions
- Create mockable _path_exists() and _path_is_dir() helpers
- Add debug logging for all file system operations

### ⚙️ Type of Change
Select the type(s) of change(s) included in this pull request:
- [X] 🐞 Bug fix (non-breaking change which fixes an issue)
- [ ] ✨ New feature (adds new functionality without breaking existing
usage)
- [ ] 💥 Breaking change (changes that might break existing user setups,
scripts, or configurations)


## ✅ Checklist
Before requesting a review, confirm the following:
- [X] I have reviewed my changes to ensure they meet the project's
standards.
- [X] I have tested my changes and ensured that all tests pass (`pdm run
test`).
- [X] I have formatted the code (`pdm run format`).
- [X] I have verified that linting passes (`pdm run lint`).
- [X] I have updated documentation where necessary.

By submitting this pull request, I confirm that you can use, modify,
copy, and redistribute this contribution under the terms of your
choice.


<!-- This is an auto-generated comment: release notes by coderabbit.ai
-->
## Summary by CodeRabbit

* **Refactor**
  * Made user prompt non‑blocking to improve responsiveness.
  * Converted filesystem/path handling and prefs I/O to async‑friendly
    operations; moved blocking network and file work to background tasks.
  * Added async file/path helpers and async port‑check before browser
    connections.

* **Tests**
  * Expanded unit tests for path helpers, image download success/failure,
    prefs writing, and directory creation/renaming workflows.

<sub>✏️ Tip: You can customize this high-level summary in your review
settings.</sub>
<!-- end of auto-generated comment: release notes by coderabbit.ai -->
This commit is contained in:
Jens
2025-12-05 20:53:40 +01:00
committed by GitHub
parent 6cbc25b54c
commit 220c01f257
9 changed files with 527 additions and 303 deletions

View File

@@ -937,7 +937,7 @@ class KleinanzeigenBot(WebScrapingMixin):
LOG.warning("# Payment form detected! Please proceed with payment.")
LOG.warning("############################################")
await self.web_scroll_page_down()
input(_("Press a key to continue..."))
await ainput(_("Press a key to continue..."))
except TimeoutError:
pass
@@ -1108,7 +1108,7 @@ class KleinanzeigenBot(WebScrapingMixin):
# in some categories we need to go another dialog back
try:
await self.web_find(By.XPATH, '//dialog//button[contains(., "Andere Versandmethoden")]',
timeout=short_timeout)
timeout = short_timeout)
except TimeoutError:
await self.web_click(By.XPATH, '//dialog//button[contains(., "Zurück")]')

View File

@@ -1,18 +1,21 @@
# SPDX-FileCopyrightText: © Sebastian Thomschke and contributors
# SPDX-License-Identifier: AGPL-3.0-or-later
# SPDX-ArtifactOfProjectHomePage: https://github.com/Second-Hand-Friends/kleinanzeigen-bot/
import asyncio
from gettext import gettext as _
import json, mimetypes, os, re, shutil # isort: skip
import json, mimetypes, re, shutil # isort: skip
import urllib.error as urllib_error
import urllib.request as urllib_request
from datetime import datetime
from pathlib import Path
from typing import Any, Final
from kleinanzeigen_bot.model.ad_model import ContactPartial
from .model.ad_model import AdPartial
from .model.config_model import Config
from .utils import dicts, i18n, loggers, misc, reflect
from .utils import dicts, files, i18n, loggers, misc, reflect
from .utils.web_scraping_mixin import Browser, By, Element, WebScrapingMixin
__all__ = [
@@ -44,23 +47,39 @@ class AdExtractor(WebScrapingMixin):
"""
# create sub-directory for ad(s) to download (if necessary):
relative_directory = "downloaded-ads"
# make sure configured base directory exists
if not os.path.exists(relative_directory) or not os.path.isdir(relative_directory):
os.mkdir(relative_directory)
LOG.info("Created ads directory at ./%s.", relative_directory)
relative_directory = Path("downloaded-ads")
# make sure configured base directory exists (using exist_ok=True to avoid TOCTOU race)
await asyncio.get_running_loop().run_in_executor(None, lambda: relative_directory.mkdir(exist_ok = True)) # noqa: ASYNC240
LOG.info("Ensured ads directory exists at ./%s.", relative_directory)
# Extract ad info and determine final directory path
ad_cfg, final_dir = await self._extract_ad_page_info_with_directory_handling(
relative_directory, ad_id
)
# Save the ad configuration file
ad_file_path = final_dir + "/" + f"ad_{ad_id}.yaml"
dicts.save_dict(
ad_file_path,
ad_cfg.model_dump(),
header = "# yaml-language-server: $schema=https://raw.githubusercontent.com/Second-Hand-Friends/kleinanzeigen-bot/refs/heads/main/schemas/ad.schema.json")
# Save the ad configuration file (offload to executor to avoid blocking the event loop)
ad_file_path = str(Path(final_dir) / f"ad_{ad_id}.yaml")
header_string = "# yaml-language-server: $schema=https://raw.githubusercontent.com/Second-Hand-Friends/kleinanzeigen-bot/refs/heads/main/schemas/ad.schema.json"
await asyncio.get_running_loop().run_in_executor(
None,
lambda: dicts.save_dict(ad_file_path, ad_cfg.model_dump(), header = header_string)
)
@staticmethod
def _download_and_save_image_sync(url:str, directory:str, filename_prefix:str, img_nr:int) -> str | None:
try:
with urllib_request.urlopen(url) as response: # noqa: S310 Audit URL open for permitted schemes.
content_type = response.info().get_content_type()
file_ending = mimetypes.guess_extension(content_type) or ""
# Use pathlib.Path for OS-agnostic path handling
img_path = Path(directory) / f"{filename_prefix}{img_nr}{file_ending}"
with open(img_path, "wb") as f:
shutil.copyfileobj(response, f)
return str(img_path)
except (urllib_error.URLError, urllib_error.HTTPError, OSError, shutil.Error) as e:
# Narrow exception handling to expected network/filesystem errors
LOG.warning("Failed to download image %s: %s", url, e)
return None
async def _download_images_from_ad_page(self, directory:str, ad_id:int) -> list[str]:
"""
@@ -85,19 +104,26 @@ class AdExtractor(WebScrapingMixin):
img_nr = 1
dl_counter = 0
loop = asyncio.get_running_loop()
for img_element in images:
current_img_url = img_element.attrs["src"] # URL of the image
if current_img_url is None:
continue
with urllib_request.urlopen(str(current_img_url)) as response: # noqa: S310 Audit URL open for permitted schemes.
content_type = response.info().get_content_type()
file_ending = mimetypes.guess_extension(content_type)
img_path = f"{directory}/{img_fn_prefix}{img_nr}{file_ending}"
with open(img_path, "wb") as f:
shutil.copyfileobj(response, f)
img_path = await loop.run_in_executor(
None,
self._download_and_save_image_sync,
str(current_img_url),
directory,
img_fn_prefix,
img_nr
)
if img_path:
dl_counter += 1
img_paths.append(img_path.rsplit("/", maxsplit = 1)[-1])
# Use pathlib.Path for OS-agnostic path handling
img_paths.append(Path(img_path).name)
img_nr += 1
LOG.info("Downloaded %s.", i18n.pluralize("image", dl_counter))
@@ -354,8 +380,8 @@ class AdExtractor(WebScrapingMixin):
return ad_cfg
async def _extract_ad_page_info_with_directory_handling(
self, relative_directory:str, ad_id:int
) -> tuple[AdPartial, str]:
self, relative_directory:Path, ad_id:int
) -> tuple[AdPartial, Path]:
"""
Extracts ad information and handles directory creation/renaming.
@@ -373,32 +399,37 @@ class AdExtractor(WebScrapingMixin):
# Determine the final directory path
sanitized_title = misc.sanitize_folder_name(title, self.config.download.folder_name_max_length)
final_dir = os.path.join(relative_directory, f"ad_{ad_id}_{sanitized_title}")
temp_dir = os.path.join(relative_directory, f"ad_{ad_id}")
final_dir = relative_directory / f"ad_{ad_id}_{sanitized_title}"
temp_dir = relative_directory / f"ad_{ad_id}"
loop = asyncio.get_running_loop()
# Handle existing directories
if os.path.exists(final_dir):
if await files.exists(final_dir):
# If the folder with title already exists, delete it
LOG.info("Deleting current folder of ad %s...", ad_id)
shutil.rmtree(final_dir)
LOG.debug("Removing directory tree: %s", final_dir)
await loop.run_in_executor(None, shutil.rmtree, str(final_dir))
if os.path.exists(temp_dir):
if await files.exists(temp_dir):
if self.config.download.rename_existing_folders:
# Rename the old folder to the new name with title
LOG.info("Renaming folder from %s to %s for ad %s...",
os.path.basename(temp_dir), os.path.basename(final_dir), ad_id)
os.rename(temp_dir, final_dir)
temp_dir.name, final_dir.name, ad_id)
LOG.debug("Renaming: %s -> %s", temp_dir, final_dir)
await loop.run_in_executor(None, temp_dir.rename, final_dir)
else:
# Use the existing folder without renaming
final_dir = temp_dir
LOG.info("Using existing folder for ad %s at %s.", ad_id, final_dir)
else:
# Create new directory with title
os.mkdir(final_dir)
LOG.debug("Creating new directory: %s", final_dir)
await loop.run_in_executor(None, final_dir.mkdir)
LOG.info("New directory for ad created at %s.", final_dir)
# Now extract complete ad info (including images) to the final directory
ad_cfg = await self._extract_ad_page_info(final_dir, ad_id)
ad_cfg = await self._extract_ad_page_info(str(final_dir), ad_id)
return ad_cfg, final_dir

View File

@@ -173,7 +173,10 @@ kleinanzeigen_bot/__init__.py:
kleinanzeigen_bot/extract.py:
#################################################
download_ad:
"Created ads directory at ./%s.": "Verzeichnis für Anzeigen erstellt unter ./%s."
"Ensured ads directory exists at ./%s.": "Verzeichnis [%s] für Anzeige vorhanden."
_download_and_save_image_sync:
"Failed to download image %s: %s": "Fehler beim Herunterladen des Bildes %s: %s"
_download_images_from_ad_page:
"Found %s.": "%s gefunden."

View File

@@ -1,7 +1,8 @@
# SPDX-FileCopyrightText: © Sebastian Thomschke and contributors
# SPDX-License-Identifier: AGPL-3.0-or-later
# SPDX-ArtifactOfProjectHomePage: https://github.com/Second-Hand-Friends/kleinanzeigen-bot/
import os
import asyncio, os # isort: skip
from pathlib import Path
def abspath(relative_path:str, relative_to:str | None = None) -> str:
@@ -24,3 +25,23 @@ def abspath(relative_path:str, relative_to:str | None = None) -> str:
base = os.path.dirname(base)
return os.path.normpath(os.path.join(base, relative_path))
async def exists(path:str | Path) -> bool:
"""
Asynchronously check if a file or directory exists.
:param path: Path to check
:return: True if path exists, False otherwise
"""
return await asyncio.get_running_loop().run_in_executor(None, Path(path).exists)
async def is_dir(path:str | Path) -> bool:
"""
Asynchronously check if a path is a directory.
:param path: Path to check
:return: True if path is a directory, False otherwise
"""
return await asyncio.get_running_loop().run_in_executor(None, Path(path).is_dir)

View File

@@ -22,7 +22,7 @@ from nodriver.core.tab import Tab as Page
from kleinanzeigen_bot.model.config_model import Config as BotConfig
from kleinanzeigen_bot.model.config_model import TimeoutConfig
from . import loggers, net
from . import files, loggers, net
from .chrome_version_detector import (
ChromeVersionInfo,
detect_chrome_version_from_binary,
@@ -100,6 +100,37 @@ class BrowserConfig:
self.profile_name:str | None = None
def _write_initial_prefs(prefs_file:str) -> None:
with open(prefs_file, "w", encoding = "UTF-8") as fd:
json.dump({
"credentials_enable_service": False,
"enable_do_not_track": True,
"google": {
"services": {
"consented_to_sync": False
}
},
"profile": {
"default_content_setting_values": {
"popups": 0,
"notifications": 2 # 1 = allow, 2 = block browser notifications
},
"password_manager_enabled": False
},
"signin": {
"allowed": False
},
"translate_site_blacklist": [
"www.kleinanzeigen.de"
],
"devtools": {
"preferences": {
"currentDockState": '"bottom"'
}
}
}, fd)
class WebScrapingMixin:
def __init__(self) -> None:
@@ -174,7 +205,7 @@ class WebScrapingMixin:
LOG.info("Creating Browser session...")
if self.browser_config.binary_location:
ensure(os.path.exists(self.browser_config.binary_location), f"Specified browser binary [{self.browser_config.binary_location}] does not exist.")
ensure(await files.exists(self.browser_config.binary_location), f"Specified browser binary [{self.browser_config.binary_location}] does not exist.")
else:
self.browser_config.binary_location = self.get_compatible_browser()
LOG.info(" -> Browser binary location: %s", self.browser_config.binary_location)
@@ -289,41 +320,14 @@ class WebScrapingMixin:
profile_dir = os.path.join(cfg.user_data_dir, self.browser_config.profile_name or "Default")
os.makedirs(profile_dir, exist_ok = True)
prefs_file = os.path.join(profile_dir, "Preferences")
if not os.path.exists(prefs_file):
if not await files.exists(prefs_file):
LOG.info(" -> Setting chrome prefs [%s]...", prefs_file)
with open(prefs_file, "w", encoding = "UTF-8") as fd:
json.dump({
"credentials_enable_service": False,
"enable_do_not_track": True,
"google": {
"services": {
"consented_to_sync": False
}
},
"profile": {
"default_content_setting_values": {
"popups": 0,
"notifications": 2 # 1 = allow, 2 = block browser notifications
},
"password_manager_enabled": False
},
"signin": {
"allowed": False
},
"translate_site_blacklist": [
"www.kleinanzeigen.de"
],
"devtools": {
"preferences": {
"currentDockState": '"bottom"'
}
}
}, fd)
await asyncio.get_running_loop().run_in_executor(None, _write_initial_prefs, prefs_file)
# load extensions
for crx_extension in self.browser_config.extensions:
LOG.info(" -> Adding Browser extension: [%s]", crx_extension)
ensure(os.path.exists(crx_extension), f"Configured extension-file [{crx_extension}] does not exist.")
ensure(await files.exists(crx_extension), f"Configured extension-file [{crx_extension}] does not exist.")
cfg.add_extension(crx_extension)
try: