From 0b995fae18b77fb55cde2053047a51fb0d7d49ca Mon Sep 17 00:00:00 2001 From: Jens <1742418+1cu@users.noreply.github.com> Date: Mon, 15 Dec 2025 20:46:10 +0100 Subject: [PATCH] fix: handle Unicode normalization in save_dict for umlauts (#728) (#729) --- src/kleinanzeigen_bot/utils/dicts.py | 11 ++++- src/kleinanzeigen_bot/utils/misc.py | 6 ++- tests/unit/test_dicts.py | 40 ++++++++++++++++ tests/unit/test_extract.py | 68 ++++++++++++++++++++++++++++ tests/unit/test_utils_misc.py | 6 +-- 5 files changed, 125 insertions(+), 6 deletions(-) create mode 100644 tests/unit/test_dicts.py diff --git a/src/kleinanzeigen_bot/utils/dicts.py b/src/kleinanzeigen_bot/utils/dicts.py index 3ee97b3..e1cae97 100644 --- a/src/kleinanzeigen_bot/utils/dicts.py +++ b/src/kleinanzeigen_bot/utils/dicts.py @@ -1,7 +1,7 @@ # SPDX-FileCopyrightText: © Sebastian Thomschke and contributors # SPDX-License-Identifier: AGPL-3.0-or-later # SPDX-ArtifactOfProjectHomePage: https://github.com/Second-Hand-Friends/kleinanzeigen-bot/ -import copy, json, os # isort: skip +import copy, json, os, unicodedata # isort: skip from collections import defaultdict from collections.abc import Callable from gettext import gettext as _ @@ -112,7 +112,14 @@ def load_dict_from_module(module:ModuleType, filename:str, content_label:str = " def save_dict(filepath:str | Path, content:dict[str, Any], *, header:str | None = None) -> None: - filepath = Path(filepath).resolve(strict = False) + # Normalize filepath to NFC for cross-platform consistency (issue #728) + # Ensures file paths match NFC-normalized directory names from sanitize_folder_name() + # Also handles edge cases where paths don't originate from sanitize_folder_name() + filepath = Path(unicodedata.normalize("NFC", str(filepath))) + + # Create parent directory if needed + filepath.parent.mkdir(parents = True, exist_ok = True) + LOG.info("Saving [%s]...", filepath) with open(filepath, "w", encoding = "utf-8") as file: if header: diff --git a/src/kleinanzeigen_bot/utils/misc.py b/src/kleinanzeigen_bot/utils/misc.py index ce140fd..767d738 100644 --- a/src/kleinanzeigen_bot/utils/misc.py +++ b/src/kleinanzeigen_bot/utils/misc.py @@ -289,8 +289,12 @@ def sanitize_folder_name(name:str, max_length:int = 100) -> str: if not raw: return "untitled" - raw = unicodedata.normalize("NFC", raw) + # Apply sanitization, then normalize to NFC + # Note: sanitize-filename converts to NFD, so we must normalize AFTER sanitizing + # to ensure consistent NFC encoding across platforms (macOS HFS+, Linux, Windows) + # This prevents path mismatches when saving files to sanitized directories (issue #728) safe:str = sanitize(raw) + safe = unicodedata.normalize("NFC", safe) # Truncate with word-boundary preference if len(safe) > max_length: diff --git a/tests/unit/test_dicts.py b/tests/unit/test_dicts.py new file mode 100644 index 0000000..b451d20 --- /dev/null +++ b/tests/unit/test_dicts.py @@ -0,0 +1,40 @@ +# SPDX-FileCopyrightText: © Sebastian Thomschke and contributors +# SPDX-License-Identifier: AGPL-3.0-or-later +# SPDX-ArtifactOfProjectHomePage: https://github.com/Second-Hand-Friends/kleinanzeigen-bot/ +"""Tests for the dicts utility module.""" +import unicodedata +from pathlib import Path + + +def test_save_dict_normalizes_unicode_paths(tmp_path:Path) -> None: + """Test that save_dict normalizes paths to NFC for cross-platform consistency (issue #728). + + Directories are created with NFC normalization (via sanitize_folder_name). + This test verifies save_dict's defensive normalization handles edge cases where + an NFD path is passed (e.g., "ä" as "a" + combining diacritic vs single character). + It should normalize to NFC and use the existing NFC directory. + """ + from kleinanzeigen_bot.utils import dicts # noqa: PLC0415 + + # Create directory with NFC normalization (as sanitize_folder_name does) + title_nfc = unicodedata.normalize("NFC", "KitchenAid Zuhälter - nie benutzt") + nfc_dir = tmp_path / f"ad_12345_{title_nfc}" + nfc_dir.mkdir(parents = True) + + # Call save_dict with NFD path (different normalization) + title_nfd = unicodedata.normalize("NFD", title_nfc) + assert title_nfc != title_nfd, "NFC and NFD should be different strings" + + nfd_path = tmp_path / f"ad_12345_{title_nfd}" / "ad_12345.yaml" + dicts.save_dict(str(nfd_path), {"test": "data", "title": title_nfc}) + + # Verify file was saved successfully + nfc_files = list(nfc_dir.glob("*.yaml")) + assert len(nfc_files) == 1, "Should have exactly one file in NFC directory" + assert nfc_files[0].name == "ad_12345.yaml" + + # On macOS/APFS, the filesystem normalizes both NFC and NFD to the same directory + # On Linux ext4, NFC normalization in save_dict ensures it uses the existing directory + # Either way, we should have exactly one YAML file total (no duplicates) + all_yaml_files = list(tmp_path.rglob("*.yaml")) + assert len(all_yaml_files) == 1, f"Expected exactly 1 YAML file total, found {len(all_yaml_files)}: {all_yaml_files}" diff --git a/tests/unit/test_extract.py b/tests/unit/test_extract.py index 40c67a2..4468e2c 100644 --- a/tests/unit/test_extract.py +++ b/tests/unit/test_extract.py @@ -1225,3 +1225,71 @@ class TestAdExtractorDownload: assert result_dir.exists() assert (result_dir / "existing_image.jpg").exists() # File should be preserved assert ad_cfg.title == "Test Title" + + @pytest.mark.asyncio + async def test_download_ad_with_umlauts_in_title(self, extractor:AdExtractor, tmp_path:Path) -> None: + """Test cross-platform Unicode handling for ad titles with umlauts (issue #728). + + Verifies that: + 1. Directories are created with NFC-normalized names (via sanitize_folder_name) + 2. Files can be saved to those directories (via save_dict's NFC normalization) + 3. No FileNotFoundError occurs due to NFC/NFD mismatch on Linux/Windows + """ + # Title with German umlauts (ä) - common in real ads + title_with_umlauts = "KitchenAid Zuhälter - nie benutzt" + + # Mock the page + page_mock = MagicMock() + page_mock.url = "https://www.kleinanzeigen.de/s-anzeige/test/12345" + extractor.page = page_mock + + base_dir = tmp_path / "downloaded-ads" + base_dir.mkdir() + + with patch.object(extractor, "web_text", new_callable = AsyncMock, side_effect = [ + title_with_umlauts, # Title extraction + title_with_umlauts, # Second title call for full extraction + "Description text", # Description + "03.02.2025" # Creation date + ]), \ + patch.object(extractor, "web_execute", new_callable = AsyncMock, return_value = { + "universalAnalyticsOpts": { + "dimensions": { + "dimension92": "", + "dimension108": "" + } + } + }), \ + patch.object(extractor, "_extract_category_from_ad_page", new_callable = AsyncMock, return_value = "160"), \ + patch.object(extractor, "_extract_special_attributes_from_ad_page", new_callable = AsyncMock, return_value = {}), \ + patch.object(extractor, "_extract_pricing_info_from_ad_page", new_callable = AsyncMock, return_value = (None, "NOT_APPLICABLE")), \ + patch.object(extractor, "_extract_shipping_info_from_ad_page", new_callable = AsyncMock, return_value = ("NOT_APPLICABLE", None, None)), \ + patch.object(extractor, "_extract_sell_directly_from_ad_page", new_callable = AsyncMock, return_value = False), \ + patch.object(extractor, "_download_images_from_ad_page", new_callable = AsyncMock, return_value = []), \ + patch.object(extractor, "_extract_contact_from_ad_page", new_callable = AsyncMock, return_value = ContactPartial( + name = "Test", zipcode = "12345", location = "Berlin" + )): + + ad_cfg, result_dir = await extractor._extract_ad_page_info_with_directory_handling( + base_dir, 12345 + ) + + # Verify directory was created with NFC-normalized name + assert result_dir.exists() + assert ad_cfg.title == title_with_umlauts + + # Test saving YAML file to the Unicode directory path + # Before fix: Failed on Linux/Windows due to NFC/NFD mismatch + # After fix: Both directory and file use NFC normalization + ad_file_path = Path(result_dir) / "ad_12345.yaml" + + from kleinanzeigen_bot.utils import dicts # noqa: PLC0415 + + header_string = "# yaml-language-server: $schema=https://raw.githubusercontent.com/Second-Hand-Friends/kleinanzeigen-bot/refs/heads/main/schemas/ad.schema.json" + + # save_dict normalizes path to NFC, matching the NFC directory name + dicts.save_dict(str(ad_file_path), ad_cfg.model_dump(), header = header_string) + + # Verify file was created successfully (no FileNotFoundError) + assert ad_file_path.exists() + assert ad_file_path.is_file() diff --git a/tests/unit/test_utils_misc.py b/tests/unit/test_utils_misc.py index 523ee35..9e5b5ef 100644 --- a/tests/unit/test_utils_misc.py +++ b/tests/unit/test_utils_misc.py @@ -144,9 +144,9 @@ def test_ensure_non_callable_truthy_and_falsy() -> None: # Basic sanitization ("My Ad Title!", "My Ad Title!", "Basic sanitization"), - # Unicode normalization (sanitize-filename changes normalization) - ("café", "cafe\u0301", "Unicode normalization"), - ("caf\u00e9", "cafe\u0301", "Unicode normalization from escaped"), + # Unicode normalization - sanitize-filename converts to NFD, then we normalize to NFC (issue #728) + ("café", "café", "Unicode NFC → NFD (by sanitize) → NFC (by normalize)"), + ("caf\u00e9", "café", "Unicode NFC (escaped) → NFD → NFC"), # Edge cases ("", "untitled", "Empty string"),