feat: enhanced folder naming (#599)

This commit is contained in:
Jens Bergmann
2025-08-12 10:43:26 +02:00
committed by GitHub
parent 1e0c7216ad
commit 91a40b0116
11 changed files with 369 additions and 25 deletions

View File

@@ -45,15 +45,35 @@ class AdExtractor(WebScrapingMixin):
os.mkdir(relative_directory)
LOG.info("Created ads directory at ./%s.", relative_directory)
new_base_dir = os.path.join(relative_directory, f"ad_{ad_id}")
# First, extract ad info to get the title
temp_dir = os.path.join(relative_directory, f"ad_{ad_id}")
ad_cfg:AdPartial = await self._extract_ad_page_info(temp_dir, ad_id)
# Create folder name with ad title
sanitized_title = misc.sanitize_folder_name(ad_cfg.title, self.config.download.folder_name_max_length)
new_base_dir = os.path.join(relative_directory, f"ad_{ad_id}_{sanitized_title}")
# If the folder with title already exists, delete it
if os.path.exists(new_base_dir):
LOG.info("Deleting current folder of ad %s...", ad_id)
shutil.rmtree(new_base_dir)
os.mkdir(new_base_dir)
LOG.info("New directory for ad created at %s.", new_base_dir)
# call extraction function
ad_cfg:AdPartial = await self._extract_ad_page_info(new_base_dir, ad_id)
# If the old folder without title exists, handle based on configuration
if os.path.exists(temp_dir):
if self.config.download.rename_existing_folders:
LOG.info("Renaming folder from %s to %s for ad %s...",
os.path.basename(temp_dir), os.path.basename(new_base_dir), ad_id)
os.rename(temp_dir, new_base_dir)
else:
# Use the existing folder without renaming
new_base_dir = temp_dir
LOG.info("Using existing folder for ad %s at %s.", ad_id, new_base_dir)
else:
# Create new directory with title
os.mkdir(new_base_dir)
LOG.info("New directory for ad created at %s.", new_base_dir)
# Save the ad configuration file
ad_file_path = new_base_dir + "/" + f"ad_{ad_id}.yaml"
dicts.save_dict(
ad_file_path,

View File

@@ -66,6 +66,16 @@ class DownloadConfig(ContextualModel):
default_factory = list,
description = "list of shipping options to exclude, e.g. ['DHL_2', 'DHL_5']"
)
folder_name_max_length:int = Field(
default = 100,
ge = 10,
le = 255,
description = "maximum length for folder names when downloading ads (default: 100)"
)
rename_existing_folders:bool = Field(
default = False,
description = "if true, rename existing folders without titles to include titles (default: false)"
)
class BrowserConfig(ContextualModel):

View File

@@ -176,6 +176,8 @@ kleinanzeigen_bot/extract.py:
"Created ads directory at ./%s.": "Verzeichnis für Anzeigen erstellt unter ./%s."
"Deleting current folder of ad %s...": "Lösche aktuellen Ordner der Anzeige %s..."
"New directory for ad created at %s.": "Neues Verzeichnis für Anzeige erstellt unter %s."
"Renaming folder from %s to %s for ad %s...": "Benenne Ordner von %s zu %s für Anzeige %s um..."
"Using existing folder for ad %s at %s.": "Verwende bestehenden Ordner für Anzeige %s unter %s."
_download_images_from_ad_page:
"Found %s.": "%s gefunden."

View File

@@ -2,11 +2,14 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# SPDX-ArtifactOfProjectHomePage: https://github.com/Second-Hand-Friends/kleinanzeigen-bot/
import asyncio, decimal, re, sys, time # isort: skip
import unicodedata
from collections.abc import Callable
from datetime import datetime, timedelta, timezone
from gettext import gettext as _
from typing import Any, Mapping, TypeVar
from sanitize_filename import sanitize
from . import i18n
# https://mypy.readthedocs.io/en/stable/generics.html#generic-functions
@@ -263,3 +266,36 @@ def format_timedelta(td:timedelta) -> str:
parts.append(i18n.pluralize("second", seconds))
return ", ".join(parts) if parts else i18n.pluralize("second", 0)
def sanitize_folder_name(name:str, max_length:int = 100) -> str:
    """
    Sanitize a string for use as a folder name using `sanitize-filename`.

    The input is stripped, NFC-normalized and passed through `sanitize()`;
    character filtering and reserved-name handling are delegated to that
    library. The result is then shortened to at most `max_length`
    characters, preferring to cut at a space or underscore when one lies
    in the last ~30% of the allowed length.

    After shortening, trailing spaces and dots are removed again, because
    the cut can expose them and Windows does not support directory names
    that end in a space or a period.

    Args:
        name: The input string.
        max_length: Maximum length of the resulting folder name (default: 100).

    Returns:
        A sanitized folder name. Falls back to "untitled" when the input is
        empty/whitespace-only or when nothing usable remains after
        truncation (note that this fallback is returned as-is, even if
        `max_length` is smaller than its length).
    """
    # Normalize whitespace and handle empty input
    raw = (name or "").strip()
    if not raw:
        return "untitled"

    raw = unicodedata.normalize("NFC", raw)
    safe:str = sanitize(raw)

    # Truncate with word-boundary preference
    if len(safe) > max_length:
        truncated = safe[:max_length]
        last_break = max(truncated.rfind(" "), truncated.rfind("_"))
        # Only honour the word boundary if it does not throw away more than
        # ~30% of the allowed length; otherwise cut hard at max_length.
        safe = truncated[:last_break] if last_break > int(max_length * 0.7) else truncated
        # The cut may end on a space or dot (e.g. "foo. bar" -> "foo."),
        # which is not a portable directory name ending.
        safe = safe.rstrip(" .")

    # Truncation plus stripping can leave nothing (e.g. a dots-only prefix).
    return safe or "untitled"