feat: enhanced folder naming (#599)

2026-03-12 02:31:45 +01:00 · 2025-08-12 10:43:26 +02:00
parent 1e0c7216ad
commit 91a40b0116
11 changed files with 369 additions and 25 deletions
--- a/tests/unit/test_extract.py
+++ b/tests/unit/test_extract.py
@@ -732,16 +732,17 @@ class TestAdExtractorDownload:
                patch("os.path.isdir") as mock_isdir, \
                patch("os.makedirs") as mock_makedirs, \
                patch("os.mkdir") as mock_mkdir, \
+                patch("os.rename") as mock_rename, \
                patch("shutil.rmtree") as mock_rmtree, \
                patch("kleinanzeigen_bot.extract.dicts.save_dict", autospec = True) as mock_save_dict, \
                patch.object(extractor, "_extract_ad_page_info", new_callable = AsyncMock) as mock_extract:

            base_dir = "downloaded-ads"
-            ad_dir = os.path.join(base_dir, "ad_12345")
-            yaml_path = os.path.join(ad_dir, "ad_12345.yaml")
+            final_dir = os.path.join(base_dir, "ad_12345_Test Advertisement Title")
+            yaml_path = os.path.join(final_dir, "ad_12345.yaml")

            # Configure mocks for directory checks
-            existing_paths = {base_dir, ad_dir}
+            existing_paths = {base_dir, final_dir}  # Final directory with title exists
            mock_exists.side_effect = lambda path: path in existing_paths
            mock_isdir.side_effect = lambda path: path == base_dir

@@ -763,12 +764,12 @@ class TestAdExtractorDownload:

            # Verify the correct functions were called
            mock_extract.assert_called_once()
-            mock_rmtree.assert_called_once_with(ad_dir)
-            mock_mkdir.assert_called_once_with(ad_dir)
+            mock_rmtree.assert_called_once_with(final_dir)  # Delete the final directory with title
+            mock_mkdir.assert_called_once_with(final_dir)  # Create the final directory with title
            mock_makedirs.assert_not_called()  # Directory already exists
+            mock_rename.assert_not_called()  # No renaming needed

            # Get the actual call arguments
-            # Workaround for hard-coded path in download_ad
            actual_call = mock_save_dict.call_args
            assert actual_call is not None
            actual_path = actual_call[0][0].replace("/", os.path.sep)
@@ -790,13 +791,14 @@ class TestAdExtractorDownload:
                patch("os.path.isdir") as mock_isdir, \
                patch("os.makedirs") as mock_makedirs, \
                patch("os.mkdir") as mock_mkdir, \
+                patch("os.rename") as mock_rename, \
                patch("shutil.rmtree") as mock_rmtree, \
                patch("kleinanzeigen_bot.extract.dicts.save_dict", autospec = True) as mock_save_dict, \
                patch.object(extractor, "_extract_ad_page_info", new_callable = AsyncMock) as mock_extract:

            base_dir = "downloaded-ads"
-            ad_dir = os.path.join(base_dir, "ad_12345")
-            yaml_path = os.path.join(ad_dir, "ad_12345.yaml")
+            final_dir = os.path.join(base_dir, "ad_12345_Test Advertisement Title")
+            yaml_path = os.path.join(final_dir, "ad_12345.yaml")

            # Configure mocks for directory checks
            mock_exists.return_value = False
@@ -823,9 +825,118 @@ class TestAdExtractorDownload:
            mock_rmtree.assert_not_called()  # No directory to remove
            mock_mkdir.assert_has_calls([
                call(base_dir),
-                call(ad_dir)
+                call(final_dir)  # Create the final directory with title
            ])
            mock_makedirs.assert_not_called()  # Using mkdir instead
+            mock_rename.assert_not_called()  # No renaming needed
+
+            # Get the actual call arguments
+            actual_call = mock_save_dict.call_args
+            assert actual_call is not None
+            actual_path = actual_call[0][0].replace("/", os.path.sep)
+            assert actual_path == yaml_path
+            assert actual_call[0][1] == mock_extract.return_value.model_dump()
+
+    @pytest.mark.asyncio
+    async def test_download_ad_use_existing_folder(self, extractor:AdExtractor) -> None:
+        """Test downloading an ad when an old folder without title exists (default behavior)."""
+        with patch("os.path.exists") as mock_exists, \
+                patch("os.path.isdir") as mock_isdir, \
+                patch("os.makedirs") as mock_makedirs, \
+                patch("os.mkdir") as mock_mkdir, \
+                patch("os.rename") as mock_rename, \
+                patch("shutil.rmtree") as mock_rmtree, \
+                patch("kleinanzeigen_bot.extract.dicts.save_dict", autospec = True) as mock_save_dict, \
+                patch.object(extractor, "_extract_ad_page_info", new_callable = AsyncMock) as mock_extract:
+
+            base_dir = "downloaded-ads"
+            temp_dir = os.path.join(base_dir, "ad_12345")
+            yaml_path = os.path.join(temp_dir, "ad_12345.yaml")
+
+            # Configure mocks for directory checks
+            # Base directory exists, temp directory exists
+            existing_paths = {base_dir, temp_dir}
+            mock_exists.side_effect = lambda path: path in existing_paths
+            mock_isdir.side_effect = lambda path: path == base_dir
+
+            mock_extract.return_value = AdPartial.model_validate({
+                "title": "Test Advertisement Title",
+                "description": "Test Description",
+                "category": "Dienstleistungen",
+                "price": 100,
+                "images": [],
+                "contact": {
+                    "name": "Test User",
+                    "street": "Test Street 123",
+                    "zipcode": "12345",
+                    "location": "Test City"
+                }
+            })
+
+            await extractor.download_ad(12345)
+
+            # Verify the correct functions were called
+            mock_extract.assert_called_once()
+            mock_rmtree.assert_not_called()  # No directory to remove
+            mock_mkdir.assert_not_called()  # Base directory already exists
+            mock_makedirs.assert_not_called()  # Using mkdir instead
+            mock_rename.assert_not_called()  # No renaming (default behavior)
+
+            # Get the actual call arguments
+            actual_call = mock_save_dict.call_args
+            assert actual_call is not None
+            actual_path = actual_call[0][0].replace("/", os.path.sep)
+            assert actual_path == yaml_path
+            assert actual_call[0][1] == mock_extract.return_value.model_dump()
+
+    @pytest.mark.asyncio
+    async def test_download_ad_rename_existing_folder_when_enabled(self, extractor:AdExtractor) -> None:
+        """Test downloading an ad when an old folder without title exists and renaming is enabled."""
+        # Enable renaming in config
+        extractor.config.download.rename_existing_folders = True
+
+        with patch("os.path.exists") as mock_exists, \
+                patch("os.path.isdir") as mock_isdir, \
+                patch("os.makedirs") as mock_makedirs, \
+                patch("os.mkdir") as mock_mkdir, \
+                patch("os.rename") as mock_rename, \
+                patch("shutil.rmtree") as mock_rmtree, \
+                patch("kleinanzeigen_bot.extract.dicts.save_dict", autospec = True) as mock_save_dict, \
+                patch.object(extractor, "_extract_ad_page_info", new_callable = AsyncMock) as mock_extract:
+
+            base_dir = "downloaded-ads"
+            temp_dir = os.path.join(base_dir, "ad_12345")
+            final_dir = os.path.join(base_dir, "ad_12345_Test Advertisement Title")
+            yaml_path = os.path.join(final_dir, "ad_12345.yaml")
+
+            # Configure mocks for directory checks
+            # Base directory exists, temp directory exists, final directory doesn't exist
+            existing_paths = {base_dir, temp_dir}
+            mock_exists.side_effect = lambda path: path in existing_paths
+            mock_isdir.side_effect = lambda path: path == base_dir
+
+            mock_extract.return_value = AdPartial.model_validate({
+                "title": "Test Advertisement Title",
+                "description": "Test Description",
+                "category": "Dienstleistungen",
+                "price": 100,
+                "images": [],
+                "contact": {
+                    "name": "Test User",
+                    "street": "Test Street 123",
+                    "zipcode": "12345",
+                    "location": "Test City"
+                }
+            })
+
+            await extractor.download_ad(12345)
+
+            # Verify the correct functions were called
+            mock_extract.assert_called_once()
+            mock_rmtree.assert_not_called()  # No directory to remove
+            mock_mkdir.assert_not_called()  # Base directory already exists
+            mock_makedirs.assert_not_called()  # Using mkdir instead
+            mock_rename.assert_called_once_with(temp_dir, final_dir)  # Rename temp to final

            # Get the actual call arguments
            actual_call = mock_save_dict.call_args
--- a/tests/unit/test_update_checker.py
+++ b/tests/unit/test_update_checker.py
@@ -234,13 +234,13 @@ class TestUpdateChecker:

    def test_update_check_state_invalid_data(self, state_file:Path) -> None:
        """Test that loading invalid state data returns a new state."""
-        state_file.write_text("invalid json")
+        state_file.write_text("invalid json", encoding = "utf-8")
        state = UpdateCheckState.load(state_file)
        assert state.last_check is None

    def test_update_check_state_missing_last_check(self, state_file:Path) -> None:
        """Test that loading state data without last_check returns a new state."""
-        state_file.write_text("{}")
+        state_file.write_text("{}", encoding = "utf-8")
        state = UpdateCheckState.load(state_file)
        assert state.last_check is None

@@ -371,7 +371,7 @@ class TestUpdateChecker:

    def test_update_check_state_invalid_date(self, state_file:Path) -> None:
        """Test that loading a state file with an invalid date string for last_check returns a new state (triggers ValueError)."""
-        state_file.write_text(json.dumps({"last_check": "not-a-date"}))
+        state_file.write_text(json.dumps({"last_check": "not-a-date"}), encoding = "utf-8")
        state = UpdateCheckState.load(state_file)
        assert state.last_check is None

@@ -471,7 +471,7 @@ class TestUpdateChecker:
        # Create a state with version 0 (old format)
        state_file.write_text(json.dumps({
            "last_check": datetime.now(timezone.utc).isoformat()
-        }))
+        }), encoding = "utf-8")

        # Load the state - should migrate to version 1
        state = UpdateCheckState.load(state_file)
@@ -490,7 +490,7 @@ class TestUpdateChecker:
        old_time = datetime.now(timezone.utc)
        state_file.write_text(json.dumps({
            "last_check": old_time.isoformat()
-        }))
+        }), encoding = "utf-8")

        # Load the state - should migrate to version 1
        state = UpdateCheckState.load(state_file)
@@ -522,7 +522,7 @@ class TestUpdateChecker:
    def test_update_check_state_load_errors(self, state_file:Path) -> None:
        """Test that load errors are handled gracefully."""
        # Test invalid JSON
-        state_file.write_text("invalid json")
+        state_file.write_text("invalid json", encoding = "utf-8")
        state = UpdateCheckState.load(state_file)
        assert state.version == 1
        assert state.last_check is None
@@ -531,7 +531,7 @@ class TestUpdateChecker:
        state_file.write_text(json.dumps({
            "version": 1,
            "last_check": "invalid-date"
-        }))
+        }), encoding = "utf-8")
        state = UpdateCheckState.load(state_file)
        assert state.version == 1
        assert state.last_check is None
@@ -542,7 +542,7 @@ class TestUpdateChecker:
        state_file.write_text(json.dumps({
            "version": 1,
            "last_check": "2024-03-20T12:00:00"
-        }))
+        }), encoding = "utf-8")
        state = UpdateCheckState.load(state_file)
        assert state.last_check is not None
        assert state.last_check.tzinfo == timezone.utc
@@ -552,7 +552,7 @@ class TestUpdateChecker:
        state_file.write_text(json.dumps({
            "version": 1,
            "last_check": "2024-03-20T12:00:00+02:00"  # 2 hours ahead of UTC
-        }))
+        }), encoding = "utf-8")
        state = UpdateCheckState.load(state_file)
        assert state.last_check is not None
        assert state.last_check.tzinfo == timezone.utc
--- a/tests/unit/test_utils_misc.py
+++ b/tests/unit/test_utils_misc.py
@@ -7,8 +7,10 @@ import sys
 from datetime import datetime, timedelta, timezone

 import pytest
+from sanitize_filename import sanitize

 from kleinanzeigen_bot.utils import misc
+from kleinanzeigen_bot.utils.misc import sanitize_folder_name


 def test_now_returns_utc_datetime() -> None:
@@ -133,3 +135,133 @@ def test_ensure_non_callable_truthy_and_falsy() -> None:
        misc.ensure("", "Should fail for empty string")
    with pytest.raises(AssertionError):
        misc.ensure(None, "Should fail for None")
+
+
+# --- Test sanitize_folder_name function ---
+@pytest.mark.parametrize(
+    ("test_input", "expected_output", "description"),
+    [
+        # Basic sanitization
+        ("My Ad Title!", "My Ad Title!", "Basic sanitization"),
+
+        # Unicode normalization (sanitize-filename changes normalization)
+        ("café", "cafe\u0301", "Unicode normalization"),
+        ("caf\u00e9", "cafe\u0301", "Unicode normalization from escaped"),
+
+        # Edge cases
+        ("", "untitled", "Empty string"),
+        ("   ", "untitled", "Whitespace only"),
+        ("___", "___", "Multiple underscores (not collapsed)"),
+
+        # Control characters (removed by sanitize-filename)
+        ("Ad\x00with\x1fcontrol", "Adwithcontrol", "Control characters removed"),
+
+        # Multiple consecutive underscores (sanitize-filename doesn't collapse them)
+        ("Ad___with___multiple___underscores", "Ad___with___multiple___underscores", "Multiple underscores preserved"),
+
+        # Special characters (removed by sanitize-filename)
+        ('file<with>invalid:chars"|?*', "filewithinvalidchars", "Special characters removed"),
+        ("file\\with\\backslashes", "filewithbackslashes", "Backslashes removed"),
+        ("file/with/slashes", "filewithslashes", "Forward slashes removed"),
+
+        # Path traversal attempts (handled by sanitize-filename)
+        ("Title with ../../etc/passwd", "Title with ....etcpasswd", "Path traversal attempt"),
+        ("Title with C:\\Windows\\System32\\cmd.exe", "Title with CWindowsSystem32cmd.exe", "Windows path traversal"),
+
+        # XSS attempts (handled by sanitize-filename)
+        ('Title with <script>alert("xss")</script>', "Title with scriptalert(xss)script", "XSS attempt"),
+    ],
+)
+def test_sanitize_folder_name_basic(test_input:str, expected_output:str, description:str) -> None:
+    """Test sanitize_folder_name function with various inputs."""
+    result = sanitize_folder_name(test_input)
+    assert result == expected_output, f"Failed for '{test_input}': {description}"
+
+
+@pytest.mark.parametrize(
+    ("test_input", "max_length", "expected_output", "description"),
+    [
+        # Length truncation
+        ("Very long advertisement title that exceeds the maximum length and should be truncated", 50,
+         "Very long advertisement title that exceeds the", "Length truncation"),
+
+        # Word boundary truncation
+        ("Short words but very long title", 20, "Short words but", "Word boundary truncation"),
+
+        # Edge case: no word boundary found
+        ("VeryLongWordWithoutSpaces", 10, "VeryLongWo", "No word boundary truncation"),
+
+        # Test default max_length (100)
+        ("This is a reasonable advertisement title that fits within the default limit", 100,
+         "This is a reasonable advertisement title that fits within the default limit", "Default max_length"),
+    ],
+)
+def test_sanitize_folder_name_truncation(test_input:str, max_length:int, expected_output:str, description:str) -> None:
+    """Test sanitize_folder_name function with length truncation."""
+    result = sanitize_folder_name(test_input, max_length = max_length)
+    assert len(result) <= max_length, f"Result exceeds max_length for '{test_input}': {description}"
+    assert result == expected_output, f"Failed for '{test_input}' with max_length={max_length}: {description}"
+
+
+# --- Test sanitize-filename behavior directly (since it's consistent across platforms) ---
+@pytest.mark.parametrize(
+    ("test_input", "expected_output"),
+    [
+        # Test sanitize-filename behavior (consistent across platforms)
+        ("test/file", "testfile"),
+        ("test\\file", "testfile"),
+        ("test<file", "testfile"),
+        ("test>file", "testfile"),
+        ('test"file', "testfile"),
+        ("test|file", "testfile"),
+        ("test?file", "testfile"),
+        ("test*file", "testfile"),
+        ("test:file", "testfile"),
+        ("CON", "__CON"),
+        ("PRN", "__PRN"),
+        ("AUX", "__AUX"),
+        ("NUL", "__NUL"),
+        ("COM1", "__COM1"),
+        ("LPT1", "__LPT1"),
+        ("file/with/slashes", "filewithslashes"),
+        ("file\\with\\backslashes", "filewithbackslashes"),
+        ('file<with>invalid:chars"|?*', "filewithinvalidchars"),
+        ("file\x00with\x1fcontrol", "filewithcontrol"),
+        ("file___with___underscores", "file___with___underscores"),
+    ],
+)
+def test_sanitize_filename_behavior(test_input:str, expected_output:str) -> None:
+    """Test sanitize-filename behavior directly (consistent across platforms)."""
+    result = sanitize(test_input)
+    assert result == expected_output, f"sanitize-filename behavior mismatch for '{test_input}'"
+
+
+# --- Test sanitize_folder_name cross-platform consistency ---
+@pytest.mark.parametrize(
+    "test_input",
+    [
+        "normal_filename",
+        "filename with spaces",
+        "filename_with_underscores",
+        "filename-with-dashes",
+        "filename.with.dots",
+        "filename123",
+        "café_filename",
+        "filename\x00with\x1fcontrol",  # Control characters
+    ],
+)
+def test_sanitize_folder_name_cross_platform_consistency(
+    monkeypatch:pytest.MonkeyPatch,
+    test_input:str
+) -> None:
+    """Test that sanitize_folder_name produces consistent results across platforms for safe inputs."""
+    platforms = ["Windows", "Darwin", "Linux"]
+    results = []
+
+    for platform in platforms:
+        monkeypatch.setattr("sys.platform", platform.lower())
+        result = sanitize_folder_name(test_input)
+        results.append(result)
+
+    # All platforms should produce the same result for safe inputs
+    assert len(set(results)) == 1, f"Cross-platform inconsistency for '{test_input}': {results}"