mirror of
https://github.com/Second-Hand-Friends/kleinanzeigen-bot.git
synced 2026-03-12 10:31:50 +01:00
feat: enhanced folder naming (#599)
This commit is contained in:
@@ -45,15 +45,35 @@ class AdExtractor(WebScrapingMixin):
|
||||
os.mkdir(relative_directory)
|
||||
LOG.info("Created ads directory at ./%s.", relative_directory)
|
||||
|
||||
new_base_dir = os.path.join(relative_directory, f"ad_{ad_id}")
|
||||
# First, extract ad info to get the title
|
||||
temp_dir = os.path.join(relative_directory, f"ad_{ad_id}")
|
||||
ad_cfg:AdPartial = await self._extract_ad_page_info(temp_dir, ad_id)
|
||||
|
||||
# Create folder name with ad title
|
||||
sanitized_title = misc.sanitize_folder_name(ad_cfg.title, self.config.download.folder_name_max_length)
|
||||
new_base_dir = os.path.join(relative_directory, f"ad_{ad_id}_{sanitized_title}")
|
||||
|
||||
# If the folder with title already exists, delete it
|
||||
if os.path.exists(new_base_dir):
|
||||
LOG.info("Deleting current folder of ad %s...", ad_id)
|
||||
shutil.rmtree(new_base_dir)
|
||||
os.mkdir(new_base_dir)
|
||||
LOG.info("New directory for ad created at %s.", new_base_dir)
|
||||
|
||||
# call extraction function
|
||||
ad_cfg:AdPartial = await self._extract_ad_page_info(new_base_dir, ad_id)
|
||||
# If the old folder without title exists, handle based on configuration
|
||||
if os.path.exists(temp_dir):
|
||||
if self.config.download.rename_existing_folders:
|
||||
LOG.info("Renaming folder from %s to %s for ad %s...",
|
||||
os.path.basename(temp_dir), os.path.basename(new_base_dir), ad_id)
|
||||
os.rename(temp_dir, new_base_dir)
|
||||
else:
|
||||
# Use the existing folder without renaming
|
||||
new_base_dir = temp_dir
|
||||
LOG.info("Using existing folder for ad %s at %s.", ad_id, new_base_dir)
|
||||
else:
|
||||
# Create new directory with title
|
||||
os.mkdir(new_base_dir)
|
||||
LOG.info("New directory for ad created at %s.", new_base_dir)
|
||||
|
||||
# Save the ad configuration file
|
||||
ad_file_path = new_base_dir + "/" + f"ad_{ad_id}.yaml"
|
||||
dicts.save_dict(
|
||||
ad_file_path,
|
||||
|
||||
@@ -66,6 +66,16 @@ class DownloadConfig(ContextualModel):
|
||||
default_factory = list,
|
||||
description = "list of shipping options to exclude, e.g. ['DHL_2', 'DHL_5']"
|
||||
)
|
||||
folder_name_max_length:int = Field(
|
||||
default = 100,
|
||||
ge = 10,
|
||||
le = 255,
|
||||
description = "maximum length for folder names when downloading ads (default: 100)"
|
||||
)
|
||||
rename_existing_folders:bool = Field(
|
||||
default = False,
|
||||
description = "if true, rename existing folders without titles to include titles (default: false)"
|
||||
)
|
||||
|
||||
|
||||
class BrowserConfig(ContextualModel):
|
||||
|
||||
@@ -176,6 +176,8 @@ kleinanzeigen_bot/extract.py:
|
||||
"Created ads directory at ./%s.": "Verzeichnis für Anzeigen erstellt unter ./%s."
|
||||
"Deleting current folder of ad %s...": "Lösche aktuellen Ordner der Anzeige %s..."
|
||||
"New directory for ad created at %s.": "Neues Verzeichnis für Anzeige erstellt unter %s."
|
||||
"Renaming folder from %s to %s for ad %s...": "Benenne Ordner von %s zu %s für Anzeige %s um..."
|
||||
"Using existing folder for ad %s at %s.": "Verwende bestehenden Ordner für Anzeige %s unter %s."
|
||||
|
||||
_download_images_from_ad_page:
|
||||
"Found %s.": "%s gefunden."
|
||||
|
||||
@@ -2,11 +2,14 @@
|
||||
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
# SPDX-ArtifactOfProjectHomePage: https://github.com/Second-Hand-Friends/kleinanzeigen-bot/
|
||||
import asyncio, decimal, re, sys, time # isort: skip
|
||||
import unicodedata
|
||||
from collections.abc import Callable
|
||||
from datetime import datetime, timedelta, timezone
|
||||
from gettext import gettext as _
|
||||
from typing import Any, Mapping, TypeVar
|
||||
|
||||
from sanitize_filename import sanitize
|
||||
|
||||
from . import i18n
|
||||
|
||||
# https://mypy.readthedocs.io/en/stable/generics.html#generic-functions
|
||||
@@ -263,3 +266,36 @@ def format_timedelta(td:timedelta) -> str:
|
||||
parts.append(i18n.pluralize("second", seconds))
|
||||
|
||||
return ", ".join(parts) if parts else i18n.pluralize("second", 0)
|
||||
|
||||
|
||||
def sanitize_folder_name(name:str, max_length:int = 100) -> str:
    """
    Sanitize a string for use as a folder name using `sanitize-filename`.

    Processing steps:
    - Surrounding whitespace is stripped and the text is normalized to NFC.
    - The text is passed through `sanitize()` from `sanitize-filename`, which is
      relied upon for removing characters and names that are invalid in file
      system paths.
    - Results longer than `max_length` characters are truncated, preferably at
      the last space or underscore if that break lies beyond 70% of
      `max_length`, so that words are not cut in half unnecessarily.
    - After truncation, trailing spaces and dots are removed, because the cut
      can expose them at the end of the name and Windows does not accept
      folder names ending in either.

    Args:
        name: The input string.
        max_length: Maximum length (in characters) of the resulting folder name (default: 100).

    Returns:
        A sanitized folder name. Falls back to "untitled" when the input is
        empty/blank or when nothing usable remains after sanitizing and truncating.
    """
    fallback = "untitled"

    # Normalize whitespace and handle empty input
    raw = (name or "").strip()
    if not raw:
        return fallback

    raw = unicodedata.normalize("NFC", raw)
    safe:str = sanitize(raw)

    # Truncate with word-boundary preference
    if len(safe) > max_length:
        truncated = safe[:max_length]
        last_break = max(truncated.rfind(" "), truncated.rfind("_"))
        # Only honour the word boundary when it keeps more than 70% of the allowed length;
        # otherwise a single long word would shrink the name far more than necessary.
        safe = truncated[:last_break] if last_break > int(max_length * 0.7) else truncated
        # The cut may end right after a dot or in the middle of a run of spaces;
        # such trailing characters are not valid at the end of a Windows folder name.
        safe = safe.rstrip(" .")

    # Honour the documented contract: never hand back an empty folder name
    return safe or fallback
|
||||
|
||||
Reference in New Issue
Block a user