mirror of
https://github.com/Second-Hand-Friends/kleinanzeigen-bot.git
synced 2026-03-12 10:31:50 +01:00
fix: eliminate async safety violations and migrate to pathlib (#697)
## ℹ️ Description Eliminate all blocking I/O operations in async contexts and modernize file path handling by migrating from os.path to pathlib.Path. - Link to the related issue(s): #692 - Get rid of the TODO in pyproject.toml - The added debug logging will ease the troubleshooting for path related issues. ## 📋 Changes Summary - Enable ASYNC210, ASYNC230, ASYNC240, ASYNC250 Ruff rules - Wrap blocking urllib.request.urlopen() in run_in_executor - Wrap blocking file operations (open, write) in run_in_executor - Replace blocking os.path calls with async helpers using run_in_executor - Replace blocking input() with await ainput() - Migrate extract.py from os.path to pathlib.Path - Use Path() constructor and / operator for path joining - Use Path.mkdir(), Path.rename() in executor instead of os functions - Create mockable _path_exists() and _path_is_dir() helpers - Add debug logging for all file system operations ### ⚙️ Type of Change Select the type(s) of change(s) included in this pull request: - [X] 🐞 Bug fix (non-breaking change which fixes an issue) - [ ] ✨ New feature (adds new functionality without breaking existing usage) - [ ] 💥 Breaking change (changes that might break existing user setups, scripts, or configurations) ## ✅ Checklist Before requesting a review, confirm the following: - [X] I have reviewed my changes to ensure they meet the project's standards. - [X] I have tested my changes and ensured that all tests pass (`pdm run test`). - [X] I have formatted the code (`pdm run format`). - [X] I have verified that linting passes (`pdm run lint`). - [X] I have updated documentation where necessary. By submitting this pull request, I confirm that you can use, modify, copy, and redistribute this contribution, under the terms of your choice. <!-- This is an auto-generated comment: release notes by coderabbit.ai --> ## Summary by CodeRabbit * **Refactor** * Made user prompt non‑blocking to improve responsiveness. 
* Converted filesystem/path handling and prefs I/O to async‑friendly operations; moved blocking network and file work to background tasks. * Added async file/path helpers and async port‑check before browser connections. * **Tests** * Expanded unit tests for path helpers, image download success/failure, prefs writing, and directory creation/renaming workflows. <sub>✏️ Tip: You can customize this high-level summary in your review settings.</sub> <!-- end of auto-generated comment: release notes by coderabbit.ai -->
This commit is contained in:
@@ -937,7 +937,7 @@ class KleinanzeigenBot(WebScrapingMixin):
|
||||
LOG.warning("# Payment form detected! Please proceed with payment.")
|
||||
LOG.warning("############################################")
|
||||
await self.web_scroll_page_down()
|
||||
input(_("Press a key to continue..."))
|
||||
await ainput(_("Press a key to continue..."))
|
||||
except TimeoutError:
|
||||
pass
|
||||
|
||||
@@ -1108,7 +1108,7 @@ class KleinanzeigenBot(WebScrapingMixin):
|
||||
# in some categories we need to go another dialog back
|
||||
try:
|
||||
await self.web_find(By.XPATH, '//dialog//button[contains(., "Andere Versandmethoden")]',
|
||||
timeout=short_timeout)
|
||||
timeout = short_timeout)
|
||||
except TimeoutError:
|
||||
await self.web_click(By.XPATH, '//dialog//button[contains(., "Zurück")]')
|
||||
|
||||
|
||||
@@ -1,18 +1,21 @@
|
||||
# SPDX-FileCopyrightText: © Sebastian Thomschke and contributors
|
||||
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
# SPDX-ArtifactOfProjectHomePage: https://github.com/Second-Hand-Friends/kleinanzeigen-bot/
|
||||
import asyncio
|
||||
from gettext import gettext as _
|
||||
|
||||
import json, mimetypes, os, re, shutil # isort: skip
|
||||
import json, mimetypes, re, shutil # isort: skip
|
||||
import urllib.error as urllib_error
|
||||
import urllib.request as urllib_request
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from typing import Any, Final
|
||||
|
||||
from kleinanzeigen_bot.model.ad_model import ContactPartial
|
||||
|
||||
from .model.ad_model import AdPartial
|
||||
from .model.config_model import Config
|
||||
from .utils import dicts, i18n, loggers, misc, reflect
|
||||
from .utils import dicts, files, i18n, loggers, misc, reflect
|
||||
from .utils.web_scraping_mixin import Browser, By, Element, WebScrapingMixin
|
||||
|
||||
__all__ = [
|
||||
@@ -44,23 +47,39 @@ class AdExtractor(WebScrapingMixin):
|
||||
"""
|
||||
|
||||
# create sub-directory for ad(s) to download (if necessary):
|
||||
relative_directory = "downloaded-ads"
|
||||
# make sure configured base directory exists
|
||||
if not os.path.exists(relative_directory) or not os.path.isdir(relative_directory):
|
||||
os.mkdir(relative_directory)
|
||||
LOG.info("Created ads directory at ./%s.", relative_directory)
|
||||
relative_directory = Path("downloaded-ads")
|
||||
# make sure configured base directory exists (using exist_ok=True to avoid TOCTOU race)
|
||||
await asyncio.get_running_loop().run_in_executor(None, lambda: relative_directory.mkdir(exist_ok = True)) # noqa: ASYNC240
|
||||
LOG.info("Ensured ads directory exists at ./%s.", relative_directory)
|
||||
|
||||
# Extract ad info and determine final directory path
|
||||
ad_cfg, final_dir = await self._extract_ad_page_info_with_directory_handling(
|
||||
relative_directory, ad_id
|
||||
)
|
||||
|
||||
# Save the ad configuration file
|
||||
ad_file_path = final_dir + "/" + f"ad_{ad_id}.yaml"
|
||||
dicts.save_dict(
|
||||
ad_file_path,
|
||||
ad_cfg.model_dump(),
|
||||
header = "# yaml-language-server: $schema=https://raw.githubusercontent.com/Second-Hand-Friends/kleinanzeigen-bot/refs/heads/main/schemas/ad.schema.json")
|
||||
# Save the ad configuration file (offload to executor to avoid blocking the event loop)
|
||||
ad_file_path = str(Path(final_dir) / f"ad_{ad_id}.yaml")
|
||||
header_string = "# yaml-language-server: $schema=https://raw.githubusercontent.com/Second-Hand-Friends/kleinanzeigen-bot/refs/heads/main/schemas/ad.schema.json"
|
||||
await asyncio.get_running_loop().run_in_executor(
|
||||
None,
|
||||
lambda: dicts.save_dict(ad_file_path, ad_cfg.model_dump(), header = header_string)
|
||||
)
|
||||
|
||||
@staticmethod
|
||||
def _download_and_save_image_sync(url:str, directory:str, filename_prefix:str, img_nr:int) -> str | None:
|
||||
try:
|
||||
with urllib_request.urlopen(url) as response: # noqa: S310 Audit URL open for permitted schemes.
|
||||
content_type = response.info().get_content_type()
|
||||
file_ending = mimetypes.guess_extension(content_type) or ""
|
||||
# Use pathlib.Path for OS-agnostic path handling
|
||||
img_path = Path(directory) / f"{filename_prefix}{img_nr}{file_ending}"
|
||||
with open(img_path, "wb") as f:
|
||||
shutil.copyfileobj(response, f)
|
||||
return str(img_path)
|
||||
except (urllib_error.URLError, urllib_error.HTTPError, OSError, shutil.Error) as e:
|
||||
# Narrow exception handling to expected network/filesystem errors
|
||||
LOG.warning("Failed to download image %s: %s", url, e)
|
||||
return None
|
||||
|
||||
async def _download_images_from_ad_page(self, directory:str, ad_id:int) -> list[str]:
|
||||
"""
|
||||
@@ -85,19 +104,26 @@ class AdExtractor(WebScrapingMixin):
|
||||
img_nr = 1
|
||||
dl_counter = 0
|
||||
|
||||
loop = asyncio.get_running_loop()
|
||||
|
||||
for img_element in images:
|
||||
current_img_url = img_element.attrs["src"] # URL of the image
|
||||
if current_img_url is None:
|
||||
continue
|
||||
|
||||
with urllib_request.urlopen(str(current_img_url)) as response: # noqa: S310 Audit URL open for permitted schemes.
|
||||
content_type = response.info().get_content_type()
|
||||
file_ending = mimetypes.guess_extension(content_type)
|
||||
img_path = f"{directory}/{img_fn_prefix}{img_nr}{file_ending}"
|
||||
with open(img_path, "wb") as f:
|
||||
shutil.copyfileobj(response, f)
|
||||
img_path = await loop.run_in_executor(
|
||||
None,
|
||||
self._download_and_save_image_sync,
|
||||
str(current_img_url),
|
||||
directory,
|
||||
img_fn_prefix,
|
||||
img_nr
|
||||
)
|
||||
|
||||
if img_path:
|
||||
dl_counter += 1
|
||||
img_paths.append(img_path.rsplit("/", maxsplit = 1)[-1])
|
||||
# Use pathlib.Path for OS-agnostic path handling
|
||||
img_paths.append(Path(img_path).name)
|
||||
|
||||
img_nr += 1
|
||||
LOG.info("Downloaded %s.", i18n.pluralize("image", dl_counter))
|
||||
@@ -354,8 +380,8 @@ class AdExtractor(WebScrapingMixin):
|
||||
return ad_cfg
|
||||
|
||||
async def _extract_ad_page_info_with_directory_handling(
|
||||
self, relative_directory:str, ad_id:int
|
||||
) -> tuple[AdPartial, str]:
|
||||
self, relative_directory:Path, ad_id:int
|
||||
) -> tuple[AdPartial, Path]:
|
||||
"""
|
||||
Extracts ad information and handles directory creation/renaming.
|
||||
|
||||
@@ -373,32 +399,37 @@ class AdExtractor(WebScrapingMixin):
|
||||
|
||||
# Determine the final directory path
|
||||
sanitized_title = misc.sanitize_folder_name(title, self.config.download.folder_name_max_length)
|
||||
final_dir = os.path.join(relative_directory, f"ad_{ad_id}_{sanitized_title}")
|
||||
temp_dir = os.path.join(relative_directory, f"ad_{ad_id}")
|
||||
final_dir = relative_directory / f"ad_{ad_id}_{sanitized_title}"
|
||||
temp_dir = relative_directory / f"ad_{ad_id}"
|
||||
|
||||
loop = asyncio.get_running_loop()
|
||||
|
||||
# Handle existing directories
|
||||
if os.path.exists(final_dir):
|
||||
if await files.exists(final_dir):
|
||||
# If the folder with title already exists, delete it
|
||||
LOG.info("Deleting current folder of ad %s...", ad_id)
|
||||
shutil.rmtree(final_dir)
|
||||
LOG.debug("Removing directory tree: %s", final_dir)
|
||||
await loop.run_in_executor(None, shutil.rmtree, str(final_dir))
|
||||
|
||||
if os.path.exists(temp_dir):
|
||||
if await files.exists(temp_dir):
|
||||
if self.config.download.rename_existing_folders:
|
||||
# Rename the old folder to the new name with title
|
||||
LOG.info("Renaming folder from %s to %s for ad %s...",
|
||||
os.path.basename(temp_dir), os.path.basename(final_dir), ad_id)
|
||||
os.rename(temp_dir, final_dir)
|
||||
temp_dir.name, final_dir.name, ad_id)
|
||||
LOG.debug("Renaming: %s -> %s", temp_dir, final_dir)
|
||||
await loop.run_in_executor(None, temp_dir.rename, final_dir)
|
||||
else:
|
||||
# Use the existing folder without renaming
|
||||
final_dir = temp_dir
|
||||
LOG.info("Using existing folder for ad %s at %s.", ad_id, final_dir)
|
||||
else:
|
||||
# Create new directory with title
|
||||
os.mkdir(final_dir)
|
||||
LOG.debug("Creating new directory: %s", final_dir)
|
||||
await loop.run_in_executor(None, final_dir.mkdir)
|
||||
LOG.info("New directory for ad created at %s.", final_dir)
|
||||
|
||||
# Now extract complete ad info (including images) to the final directory
|
||||
ad_cfg = await self._extract_ad_page_info(final_dir, ad_id)
|
||||
ad_cfg = await self._extract_ad_page_info(str(final_dir), ad_id)
|
||||
|
||||
return ad_cfg, final_dir
|
||||
|
||||
|
||||
@@ -173,7 +173,10 @@ kleinanzeigen_bot/__init__.py:
|
||||
kleinanzeigen_bot/extract.py:
|
||||
#################################################
|
||||
download_ad:
|
||||
"Created ads directory at ./%s.": "Verzeichnis für Anzeigen erstellt unter ./%s."
|
||||
"Ensured ads directory exists at ./%s.": "Verzeichnis für Anzeigen unter ./%s vorhanden."
|
||||
|
||||
_download_and_save_image_sync:
|
||||
"Failed to download image %s: %s": "Fehler beim Herunterladen des Bildes %s: %s"
|
||||
|
||||
_download_images_from_ad_page:
|
||||
"Found %s.": "%s gefunden."
|
||||
|
||||
@@ -1,7 +1,8 @@
|
||||
# SPDX-FileCopyrightText: © Sebastian Thomschke and contributors
|
||||
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
# SPDX-ArtifactOfProjectHomePage: https://github.com/Second-Hand-Friends/kleinanzeigen-bot/
|
||||
import os
|
||||
import asyncio, os # isort: skip
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def abspath(relative_path:str, relative_to:str | None = None) -> str:
|
||||
@@ -24,3 +25,23 @@ def abspath(relative_path:str, relative_to:str | None = None) -> str:
|
||||
base = os.path.dirname(base)
|
||||
|
||||
return os.path.normpath(os.path.join(base, relative_path))
|
||||
|
||||
|
||||
async def exists(path:str | Path) -> bool:
|
||||
"""
|
||||
Asynchronously check if a file or directory exists.
|
||||
|
||||
:param path: Path to check
|
||||
:return: True if path exists, False otherwise
|
||||
"""
|
||||
return await asyncio.get_running_loop().run_in_executor(None, Path(path).exists)
|
||||
|
||||
|
||||
async def is_dir(path:str | Path) -> bool:
|
||||
"""
|
||||
Asynchronously check if a path is a directory.
|
||||
|
||||
:param path: Path to check
|
||||
:return: True if path is a directory, False otherwise
|
||||
"""
|
||||
return await asyncio.get_running_loop().run_in_executor(None, Path(path).is_dir)
|
||||
|
||||
@@ -22,7 +22,7 @@ from nodriver.core.tab import Tab as Page
|
||||
from kleinanzeigen_bot.model.config_model import Config as BotConfig
|
||||
from kleinanzeigen_bot.model.config_model import TimeoutConfig
|
||||
|
||||
from . import loggers, net
|
||||
from . import files, loggers, net
|
||||
from .chrome_version_detector import (
|
||||
ChromeVersionInfo,
|
||||
detect_chrome_version_from_binary,
|
||||
@@ -100,6 +100,37 @@ class BrowserConfig:
|
||||
self.profile_name:str | None = None
|
||||
|
||||
|
||||
def _write_initial_prefs(prefs_file:str) -> None:
|
||||
with open(prefs_file, "w", encoding = "UTF-8") as fd:
|
||||
json.dump({
|
||||
"credentials_enable_service": False,
|
||||
"enable_do_not_track": True,
|
||||
"google": {
|
||||
"services": {
|
||||
"consented_to_sync": False
|
||||
}
|
||||
},
|
||||
"profile": {
|
||||
"default_content_setting_values": {
|
||||
"popups": 0,
|
||||
"notifications": 2 # 1 = allow, 2 = block browser notifications
|
||||
},
|
||||
"password_manager_enabled": False
|
||||
},
|
||||
"signin": {
|
||||
"allowed": False
|
||||
},
|
||||
"translate_site_blacklist": [
|
||||
"www.kleinanzeigen.de"
|
||||
],
|
||||
"devtools": {
|
||||
"preferences": {
|
||||
"currentDockState": '"bottom"'
|
||||
}
|
||||
}
|
||||
}, fd)
|
||||
|
||||
|
||||
class WebScrapingMixin:
|
||||
|
||||
def __init__(self) -> None:
|
||||
@@ -174,7 +205,7 @@ class WebScrapingMixin:
|
||||
LOG.info("Creating Browser session...")
|
||||
|
||||
if self.browser_config.binary_location:
|
||||
ensure(os.path.exists(self.browser_config.binary_location), f"Specified browser binary [{self.browser_config.binary_location}] does not exist.")
|
||||
ensure(await files.exists(self.browser_config.binary_location), f"Specified browser binary [{self.browser_config.binary_location}] does not exist.")
|
||||
else:
|
||||
self.browser_config.binary_location = self.get_compatible_browser()
|
||||
LOG.info(" -> Browser binary location: %s", self.browser_config.binary_location)
|
||||
@@ -289,41 +320,14 @@ class WebScrapingMixin:
|
||||
profile_dir = os.path.join(cfg.user_data_dir, self.browser_config.profile_name or "Default")
|
||||
os.makedirs(profile_dir, exist_ok = True)
|
||||
prefs_file = os.path.join(profile_dir, "Preferences")
|
||||
if not os.path.exists(prefs_file):
|
||||
if not await files.exists(prefs_file):
|
||||
LOG.info(" -> Setting chrome prefs [%s]...", prefs_file)
|
||||
with open(prefs_file, "w", encoding = "UTF-8") as fd:
|
||||
json.dump({
|
||||
"credentials_enable_service": False,
|
||||
"enable_do_not_track": True,
|
||||
"google": {
|
||||
"services": {
|
||||
"consented_to_sync": False
|
||||
}
|
||||
},
|
||||
"profile": {
|
||||
"default_content_setting_values": {
|
||||
"popups": 0,
|
||||
"notifications": 2 # 1 = allow, 2 = block browser notifications
|
||||
},
|
||||
"password_manager_enabled": False
|
||||
},
|
||||
"signin": {
|
||||
"allowed": False
|
||||
},
|
||||
"translate_site_blacklist": [
|
||||
"www.kleinanzeigen.de"
|
||||
],
|
||||
"devtools": {
|
||||
"preferences": {
|
||||
"currentDockState": '"bottom"'
|
||||
}
|
||||
}
|
||||
}, fd)
|
||||
await asyncio.get_running_loop().run_in_executor(None, _write_initial_prefs, prefs_file)
|
||||
|
||||
# load extensions
|
||||
for crx_extension in self.browser_config.extensions:
|
||||
LOG.info(" -> Adding Browser extension: [%s]", crx_extension)
|
||||
ensure(os.path.exists(crx_extension), f"Configured extension-file [{crx_extension}] does not exist.")
|
||||
ensure(await files.exists(crx_extension), f"Configured extension-file [{crx_extension}] does not exist.")
|
||||
cfg.add_extension(crx_extension)
|
||||
|
||||
try:
|
||||
|
||||
Reference in New Issue
Block a user