mirror of
https://github.com/Second-Hand-Friends/kleinanzeigen-bot.git
synced 2026-03-12 10:31:50 +01:00
This commit is contained in:
@@ -1,7 +1,7 @@
|
|||||||
# SPDX-FileCopyrightText: © Sebastian Thomschke and contributors
|
# SPDX-FileCopyrightText: © Sebastian Thomschke and contributors
|
||||||
# SPDX-License-Identifier: AGPL-3.0-or-later
|
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||||
# SPDX-ArtifactOfProjectHomePage: https://github.com/Second-Hand-Friends/kleinanzeigen-bot/
|
# SPDX-ArtifactOfProjectHomePage: https://github.com/Second-Hand-Friends/kleinanzeigen-bot/
|
||||||
import copy, json, os # isort: skip
|
import copy, json, os, unicodedata # isort: skip
|
||||||
from collections import defaultdict
|
from collections import defaultdict
|
||||||
from collections.abc import Callable
|
from collections.abc import Callable
|
||||||
from gettext import gettext as _
|
from gettext import gettext as _
|
||||||
@@ -112,7 +112,14 @@ def load_dict_from_module(module:ModuleType, filename:str, content_label:str = "
|
|||||||
|
|
||||||
|
|
||||||
def save_dict(filepath:str | Path, content:dict[str, Any], *, header:str | None = None) -> None:
|
def save_dict(filepath:str | Path, content:dict[str, Any], *, header:str | None = None) -> None:
|
||||||
filepath = Path(filepath).resolve(strict = False)
|
# Normalize filepath to NFC for cross-platform consistency (issue #728)
|
||||||
|
# Ensures file paths match NFC-normalized directory names from sanitize_folder_name()
|
||||||
|
# Also handles edge cases where paths don't originate from sanitize_folder_name()
|
||||||
|
filepath = Path(unicodedata.normalize("NFC", str(filepath)))
|
||||||
|
|
||||||
|
# Create parent directory if needed
|
||||||
|
filepath.parent.mkdir(parents = True, exist_ok = True)
|
||||||
|
|
||||||
LOG.info("Saving [%s]...", filepath)
|
LOG.info("Saving [%s]...", filepath)
|
||||||
with open(filepath, "w", encoding = "utf-8") as file:
|
with open(filepath, "w", encoding = "utf-8") as file:
|
||||||
if header:
|
if header:
|
||||||
|
|||||||
@@ -289,8 +289,12 @@ def sanitize_folder_name(name:str, max_length:int = 100) -> str:
|
|||||||
if not raw:
|
if not raw:
|
||||||
return "untitled"
|
return "untitled"
|
||||||
|
|
||||||
raw = unicodedata.normalize("NFC", raw)
|
# Apply sanitization, then normalize to NFC
|
||||||
|
# Note: sanitize-filename converts to NFD, so we must normalize AFTER sanitizing
|
||||||
|
# to ensure consistent NFC encoding across platforms (macOS HFS+, Linux, Windows)
|
||||||
|
# This prevents path mismatches when saving files to sanitized directories (issue #728)
|
||||||
safe:str = sanitize(raw)
|
safe:str = sanitize(raw)
|
||||||
|
safe = unicodedata.normalize("NFC", safe)
|
||||||
|
|
||||||
# Truncate with word-boundary preference
|
# Truncate with word-boundary preference
|
||||||
if len(safe) > max_length:
|
if len(safe) > max_length:
|
||||||
|
|||||||
40
tests/unit/test_dicts.py
Normal file
40
tests/unit/test_dicts.py
Normal file
@@ -0,0 +1,40 @@
|
|||||||
|
# SPDX-FileCopyrightText: © Sebastian Thomschke and contributors
|
||||||
|
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||||
|
# SPDX-ArtifactOfProjectHomePage: https://github.com/Second-Hand-Friends/kleinanzeigen-bot/
|
||||||
|
"""Tests for the dicts utility module."""
|
||||||
|
import unicodedata
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
|
||||||
|
def test_save_dict_normalizes_unicode_paths(tmp_path:Path) -> None:
    """Check that save_dict converts an NFD path to NFC before writing (issue #728).

    The ad directory is created up front with an NFC name, mirroring what
    sanitize_folder_name produces. save_dict is then handed the same location
    spelled in NFD ("ä" as "a" + combining diaeresis instead of one code point).
    The YAML file must land in the pre-existing NFC directory, and no second
    copy may appear anywhere below tmp_path.
    """
    from kleinanzeigen_bot.utils import dicts  # noqa: PLC0415

    # The same title in both normalization forms; they must differ as strings
    # for the test to exercise anything.
    composed_title = unicodedata.normalize("NFC", "KitchenAid Zuhälter - nie benutzt")
    decomposed_title = unicodedata.normalize("NFD", composed_title)

    # Pre-create the directory under its NFC name.
    target_dir = tmp_path / f"ad_12345_{composed_title}"
    target_dir.mkdir(parents = True)

    assert composed_title != decomposed_title, "NFC and NFD should be different strings"

    # Ask save_dict to write through the NFD spelling of that directory.
    decomposed_file = tmp_path / f"ad_12345_{decomposed_title}" / "ad_12345.yaml"
    dicts.save_dict(str(decomposed_file), {"test": "data", "title": composed_title})

    # The file must be found inside the NFC directory ...
    saved_in_target = [*target_dir.glob("*.yaml")]
    assert len(saved_in_target) == 1, "Should have exactly one file in NFC directory"
    assert saved_in_target[0].name == "ad_12345.yaml"

    # ... and nowhere else. On macOS/APFS the filesystem maps both spellings to the
    # same directory; on Linux ext4 it is save_dict's NFC normalization that prevents
    # a second, NFD-named directory. Either way exactly one YAML file may exist.
    all_yaml_files = [*tmp_path.rglob("*.yaml")]
    assert len(all_yaml_files) == 1, f"Expected exactly 1 YAML file total, found {len(all_yaml_files)}: {all_yaml_files}"
|
||||||
@@ -1225,3 +1225,71 @@ class TestAdExtractorDownload:
|
|||||||
assert result_dir.exists()
|
assert result_dir.exists()
|
||||||
assert (result_dir / "existing_image.jpg").exists() # File should be preserved
|
assert (result_dir / "existing_image.jpg").exists() # File should be preserved
|
||||||
assert ad_cfg.title == "Test Title"
|
assert ad_cfg.title == "Test Title"
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
async def test_download_ad_with_umlauts_in_title(self, extractor:AdExtractor, tmp_path:Path) -> None:
    """Test cross-platform Unicode handling for ad titles with umlauts (issue #728).

    Verifies that:
    1. The directory returned for the ad exists and its name is NFC-normalized
    2. A YAML file can be saved into that directory via dicts.save_dict
    3. No FileNotFoundError occurs due to an NFC/NFD mismatch between the two

    Args:
        extractor: AdExtractor fixture whose page access and extraction helpers are mocked here.
        tmp_path: pytest-provided temporary directory used as the download root.
    """
    import unicodedata  # noqa: PLC0415

    from kleinanzeigen_bot.utils import dicts  # noqa: PLC0415

    # Title with German umlauts (ä) - common in real ads
    title_with_umlauts = "KitchenAid Zuhälter - nie benutzt"

    # Mock the page
    page_mock = MagicMock()
    page_mock.url = "https://www.kleinanzeigen.de/s-anzeige/test/12345"
    extractor.page = page_mock

    base_dir = tmp_path / "downloaded-ads"
    base_dir.mkdir()

    with patch.object(extractor, "web_text", new_callable = AsyncMock, side_effect = [
        title_with_umlauts,  # Title extraction
        title_with_umlauts,  # Second title call for full extraction
        "Description text",  # Description
        "03.02.2025"  # Creation date
    ]), \
            patch.object(extractor, "web_execute", new_callable = AsyncMock, return_value = {
                "universalAnalyticsOpts": {
                    "dimensions": {
                        "dimension92": "",
                        "dimension108": ""
                    }
                }
            }), \
            patch.object(extractor, "_extract_category_from_ad_page", new_callable = AsyncMock, return_value = "160"), \
            patch.object(extractor, "_extract_special_attributes_from_ad_page", new_callable = AsyncMock, return_value = {}), \
            patch.object(extractor, "_extract_pricing_info_from_ad_page", new_callable = AsyncMock, return_value = (None, "NOT_APPLICABLE")), \
            patch.object(extractor, "_extract_shipping_info_from_ad_page", new_callable = AsyncMock, return_value = ("NOT_APPLICABLE", None, None)), \
            patch.object(extractor, "_extract_sell_directly_from_ad_page", new_callable = AsyncMock, return_value = False), \
            patch.object(extractor, "_download_images_from_ad_page", new_callable = AsyncMock, return_value = []), \
            patch.object(extractor, "_extract_contact_from_ad_page", new_callable = AsyncMock, return_value = ContactPartial(
                name = "Test", zipcode = "12345", location = "Berlin"
            )):

        ad_cfg, result_dir = await extractor._extract_ad_page_info_with_directory_handling(
            base_dir, 12345
        )

    # Verify the directory was created and the title survived extraction unchanged
    assert result_dir.exists()
    assert ad_cfg.title == title_with_umlauts

    # Verify the directory name really is NFC. The name is taken from the returned
    # path object (not re-read from disk), so this check does not depend on how the
    # host filesystem stores Unicode names.
    dir_name = Path(result_dir).name
    assert unicodedata.is_normalized("NFC", dir_name), f"Directory name is not NFC-normalized: {dir_name!r}"

    # Test saving YAML file to the Unicode directory path
    # Before fix: Failed on Linux/Windows due to NFC/NFD mismatch
    # After fix: Both directory and file use NFC normalization
    ad_file_path = Path(result_dir) / "ad_12345.yaml"

    header_string = "# yaml-language-server: $schema=https://raw.githubusercontent.com/Second-Hand-Friends/kleinanzeigen-bot/refs/heads/main/schemas/ad.schema.json"

    # save_dict normalizes path to NFC, matching the NFC directory name
    dicts.save_dict(str(ad_file_path), ad_cfg.model_dump(), header = header_string)

    # Verify file was created successfully (no FileNotFoundError)
    assert ad_file_path.exists()
    assert ad_file_path.is_file()
|
||||||
|
|||||||
@@ -144,9 +144,9 @@ def test_ensure_non_callable_truthy_and_falsy() -> None:
|
|||||||
# Basic sanitization
|
# Basic sanitization
|
||||||
("My Ad Title!", "My Ad Title!", "Basic sanitization"),
|
("My Ad Title!", "My Ad Title!", "Basic sanitization"),
|
||||||
|
|
||||||
# Unicode normalization (sanitize-filename changes normalization)
|
# Unicode normalization - sanitize-filename converts to NFD, then we normalize to NFC (issue #728)
|
||||||
("café", "cafe\u0301", "Unicode normalization"),
|
("café", "café", "Unicode NFC → NFD (by sanitize) → NFC (by normalize)"),
|
||||||
("caf\u00e9", "cafe\u0301", "Unicode normalization from escaped"),
|
("caf\u00e9", "café", "Unicode NFC (escaped) → NFD → NFC"),
|
||||||
|
|
||||||
# Edge cases
|
# Edge cases
|
||||||
("", "untitled", "Empty string"),
|
("", "untitled", "Empty string"),
|
||||||
|
|||||||
Reference in New Issue
Block a user