fix: handle Unicode normalization in save_dict for umlauts (#728) (#729)

This commit is contained in:
Jens
2025-12-15 20:46:10 +01:00
committed by GitHub
parent 861b8ec367
commit 0b995fae18
5 changed files with 125 additions and 6 deletions

View File

@@ -1,7 +1,7 @@
# SPDX-FileCopyrightText: © Sebastian Thomschke and contributors
# SPDX-License-Identifier: AGPL-3.0-or-later
# SPDX-ArtifactOfProjectHomePage: https://github.com/Second-Hand-Friends/kleinanzeigen-bot/
import copy, json, os # isort: skip
import copy, json, os, unicodedata # isort: skip
from collections import defaultdict
from collections.abc import Callable
from gettext import gettext as _
@@ -112,7 +112,14 @@ def load_dict_from_module(module:ModuleType, filename:str, content_label:str = "
def save_dict(filepath:str | Path, content:dict[str, Any], *, header:str | None = None) -> None:
filepath = Path(filepath).resolve(strict = False)
# Normalize filepath to NFC for cross-platform consistency (issue #728)
# Ensures file paths match NFC-normalized directory names from sanitize_folder_name()
# Also handles edge cases where paths don't originate from sanitize_folder_name()
filepath = Path(unicodedata.normalize("NFC", str(filepath)))
# Create parent directory if needed
filepath.parent.mkdir(parents = True, exist_ok = True)
LOG.info("Saving [%s]...", filepath)
with open(filepath, "w", encoding = "utf-8") as file:
if header:

View File

@@ -289,8 +289,12 @@ def sanitize_folder_name(name:str, max_length:int = 100) -> str:
if not raw:
return "untitled"
raw = unicodedata.normalize("NFC", raw)
# Apply sanitization, then normalize to NFC
# Note: sanitize-filename converts to NFD, so we must normalize AFTER sanitizing
# to ensure consistent NFC encoding across platforms (macOS HFS+, Linux, Windows)
# This prevents path mismatches when saving files to sanitized directories (issue #728)
safe:str = sanitize(raw)
safe = unicodedata.normalize("NFC", safe)
# Truncate with word-boundary preference
if len(safe) > max_length:

40
tests/unit/test_dicts.py Normal file
View File

@@ -0,0 +1,40 @@
# SPDX-FileCopyrightText: © Sebastian Thomschke and contributors
# SPDX-License-Identifier: AGPL-3.0-or-later
# SPDX-ArtifactOfProjectHomePage: https://github.com/Second-Hand-Friends/kleinanzeigen-bot/
"""Tests for the dicts utility module."""
import unicodedata
from pathlib import Path
def test_save_dict_normalizes_unicode_paths(tmp_path:Path) -> None:
    """Check that save_dict writes into an existing NFC-named directory when given an NFD path (issue #728).

    Setup: a directory whose name contains an umlaut is created in NFC form
    (precomposed "ä"). save_dict is then called with the same location spelled
    in NFD form ("a" followed by a combining diaeresis).

    Expectation: the YAML file ends up inside the pre-created NFC directory and
    no second, differently-encoded directory receives a copy.
    """
    from kleinanzeigen_bot.utils import dicts  # noqa: PLC0415

    # Pre-create the target directory under its NFC (precomposed) name
    title_nfc = unicodedata.normalize("NFC", "KitchenAid Zuhälter - nie benutzt")
    target_dir = tmp_path / f"ad_12345_{title_nfc}"
    target_dir.mkdir(parents = True)

    # Build the same location with the decomposed (NFD) spelling of the title
    title_nfd = unicodedata.normalize("NFD", title_nfc)
    assert title_nfc != title_nfd, "NFC and NFD should be different strings"
    requested_path = tmp_path / f"ad_12345_{title_nfd}" / "ad_12345.yaml"

    payload = {"test": "data", "title": title_nfc}
    dicts.save_dict(str(requested_path), payload)

    # The file must be found through the NFC directory handle
    saved_files = list(target_dir.glob("*.yaml"))
    assert len(saved_files) == 1, "Should have exactly one file in NFC directory"
    assert saved_files[0].name == "ad_12345.yaml"

    # Regardless of whether the filesystem itself treats NFC and NFD names as
    # equivalent (presumably the case on macOS; not on typical Linux filesystems),
    # a recursive search must find exactly one YAML file, i.e. no duplicate
    # was written into a separate NFD-named directory.
    all_yaml_files = list(tmp_path.rglob("*.yaml"))
    assert len(all_yaml_files) == 1, f"Expected exactly 1 YAML file total, found {len(all_yaml_files)}: {all_yaml_files}"

View File

@@ -1225,3 +1225,71 @@ class TestAdExtractorDownload:
assert result_dir.exists()
assert (result_dir / "existing_image.jpg").exists() # File should be preserved
assert ad_cfg.title == "Test Title"
@pytest.mark.asyncio
async def test_download_ad_with_umlauts_in_title(self, extractor:AdExtractor, tmp_path:Path) -> None:
    """Exercise the download-then-save flow for an ad whose title contains an umlaut (issue #728).

    All page-scraping helpers of the extractor are replaced by mocks so that
    only the directory handling and the subsequent save_dict call are exercised.

    The test passes when:
    1. the directory returned by the extractor exists,
    2. the extracted ad keeps the original title, and
    3. save_dict can write a YAML file into that directory and the file is
       afterwards reachable through the very same path.
    """
    from kleinanzeigen_bot.utils import dicts  # noqa: PLC0415

    # Title with a German umlaut (ä), as commonly found in real ads
    title_with_umlauts = "KitchenAid Zuhälter - nie benutzt"

    # Stand-in for the browser page; only its URL is configured here
    fake_page = MagicMock()
    fake_page.url = "https://www.kleinanzeigen.de/s-anzeige/test/12345"
    extractor.page = fake_page

    download_root = tmp_path / "downloaded-ads"
    download_root.mkdir()

    # Canned answers for consecutive web_text() calls; the order matters
    web_text_replies = [
        title_with_umlauts,  # Title extraction
        title_with_umlauts,  # Second title call for full extraction
        "Description text",  # Description
        "03.02.2025",  # Creation date
    ]
    analytics_stub = {
        "universalAnalyticsOpts": {
            "dimensions": {
                "dimension92": "",
                "dimension108": "",
            }
        }
    }
    contact_stub = ContactPartial(name = "Test", zipcode = "12345", location = "Berlin")

    with patch.object(extractor, "web_text", new_callable = AsyncMock, side_effect = web_text_replies), \
            patch.object(extractor, "web_execute", new_callable = AsyncMock, return_value = analytics_stub), \
            patch.object(extractor, "_extract_category_from_ad_page", new_callable = AsyncMock, return_value = "160"), \
            patch.object(extractor, "_extract_special_attributes_from_ad_page", new_callable = AsyncMock, return_value = {}), \
            patch.object(extractor, "_extract_pricing_info_from_ad_page", new_callable = AsyncMock, return_value = (None, "NOT_APPLICABLE")), \
            patch.object(extractor, "_extract_shipping_info_from_ad_page", new_callable = AsyncMock, return_value = ("NOT_APPLICABLE", None, None)), \
            patch.object(extractor, "_extract_sell_directly_from_ad_page", new_callable = AsyncMock, return_value = False), \
            patch.object(extractor, "_download_images_from_ad_page", new_callable = AsyncMock, return_value = []), \
            patch.object(extractor, "_extract_contact_from_ad_page", new_callable = AsyncMock, return_value = contact_stub):

        ad_cfg, result_dir = await extractor._extract_ad_page_info_with_directory_handling(download_root, 12345)

        # The extractor must have created the ad directory and kept the title intact
        assert result_dir.exists()
        assert ad_cfg.title == title_with_umlauts

        # Save a YAML file into the umlaut-named directory. Per issue #728 this step
        # used to fail when directory name and file path disagreed on NFC vs. NFD.
        ad_file_path = Path(result_dir) / "ad_12345.yaml"
        schema_header = "# yaml-language-server: $schema=https://raw.githubusercontent.com/Second-Hand-Friends/kleinanzeigen-bot/refs/heads/main/schemas/ad.schema.json"
        dicts.save_dict(str(ad_file_path), ad_cfg.model_dump(), header = schema_header)

        # The file must exist under the path we asked for (no FileNotFoundError above)
        assert ad_file_path.exists()
        assert ad_file_path.is_file()

View File

@@ -144,9 +144,9 @@ def test_ensure_non_callable_truthy_and_falsy() -> None:
# Basic sanitization
("My Ad Title!", "My Ad Title!", "Basic sanitization"),
# Unicode normalization (sanitize-filename changes normalization)
("café", "cafe\u0301", "Unicode normalization"),
("caf\u00e9", "cafe\u0301", "Unicode normalization from escaped"),
# Unicode normalization - sanitize-filename converts to NFD, then we normalize to NFC (issue #728)
("café", "café", "Unicode NFC → NFD (by sanitize) → NFC (by normalize)"),
("caf\u00e9", "café", "Unicode NFC (escaped) → NFD → NFC"),
# Edge cases
("", "untitled", "Empty string"),