From df24a675a92e150190296142d8690ce4afbdfe6e Mon Sep 17 00:00:00 2001
From: Jens Bergmann <1742418+1cu@users.noreply.github.com>
Date: Sun, 17 Aug 2025 17:49:00 +0200
Subject: [PATCH] fix: resolve #612 FileNotFoundError and improve ad download
 architecture (#613)

---
 src/kleinanzeigen_bot/extract.py              | 113 +++++++----
 .../resources/translations.de.yaml            |  12 +-
 tests/unit/test_extract.py                    | 181 ++++++++++--------
 3 files changed, 179 insertions(+), 127 deletions(-)

diff --git a/src/kleinanzeigen_bot/extract.py b/src/kleinanzeigen_bot/extract.py
index 8796cd3..780c4cf 100644
--- a/src/kleinanzeigen_bot/extract.py
+++ b/src/kleinanzeigen_bot/extract.py
@@ -45,36 +45,13 @@ class AdExtractor(WebScrapingMixin):
             os.mkdir(relative_directory)
             LOG.info("Created ads directory at ./%s.", relative_directory)
 
-        # First, extract ad info to get the title
-        temp_dir = os.path.join(relative_directory, f"ad_{ad_id}")
-        ad_cfg:AdPartial = await self._extract_ad_page_info(temp_dir, ad_id)
-
-        # Create folder name with ad title
-        sanitized_title = misc.sanitize_folder_name(ad_cfg.title, self.config.download.folder_name_max_length)
-        new_base_dir = os.path.join(relative_directory, f"ad_{ad_id}_{sanitized_title}")
-
-        # If the folder with title already exists, delete it
-        if os.path.exists(new_base_dir):
-            LOG.info("Deleting current folder of ad %s...", ad_id)
-            shutil.rmtree(new_base_dir)
-
-        # If the old folder without title exists, handle based on configuration
-        if os.path.exists(temp_dir):
-            if self.config.download.rename_existing_folders:
-                LOG.info("Renaming folder from %s to %s for ad %s...",
-                        os.path.basename(temp_dir), os.path.basename(new_base_dir), ad_id)
-                os.rename(temp_dir, new_base_dir)
-            else:
-                # Use the existing folder without renaming
-                new_base_dir = temp_dir
-                LOG.info("Using existing folder for ad %s at %s.", ad_id, new_base_dir)
-        else:
-            # Create new directory with title
-            os.mkdir(new_base_dir)
-            LOG.info("New directory for ad created at %s.", new_base_dir)
+        # Extract ad info and determine final directory path
+        ad_cfg, final_dir = await self._extract_ad_page_info_with_directory_handling(
+            relative_directory, ad_id
+        )
 
         # Save the ad configuration file
-        ad_file_path = new_base_dir + "/" + f"ad_{ad_id}.yaml"
+        ad_file_path = final_dir + "/" + f"ad_{ad_id}.yaml"
         dicts.save_dict(
             ad_file_path,
             ad_cfg.model_dump(),
@@ -283,23 +260,32 @@ class AdExtractor(WebScrapingMixin):
             pass
         return True
 
+    async def _extract_title_from_ad_page(self) -> str:
+        """
+        Extracts the title from an ad page.
+        Assumes that the web driver currently shows an ad page.
+
+        :return: the ad title
+        """
+        return await self.web_text(By.ID, "viewad-title")
+
     async def _extract_ad_page_info(self, directory:str, ad_id:int) -> AdPartial:
         """
-        Extracts all necessary information from an ad´s page.
+        Extracts ad information and downloads images to the specified directory.
+        NOTE: Requires that the driver session currently is on the ad page.
 
-        :param directory: the path of the ad´s previously created directory
-        :param ad_id: the ad ID, already extracted by a calling function
+        :param directory: the directory to download images to
+        :param ad_id: the ad ID
+        :return: an AdPartial object containing the ad information
         """
         info:dict[str, Any] = {"active": True}
 
         # extract basic info
         info["type"] = "OFFER" if "s-anzeige" in self.page.url else "WANTED"
-        title:str = await self.web_text(By.ID, "viewad-title")
-        LOG.info('Extracting information from ad with title "%s"', title)
 
-        # contains info about ad in different dimensions in form of a key:value dict
-        # dimension108 contains special attributes
-        # dimension92 contains fake category, which becomes an special attribute on ad page
+        # Extract title
+        title = await self._extract_title_from_ad_page()
+
         belen_conf = await self.web_execute("window.BelenConf")
 
         info["category"] = await self._extract_category_from_ad_page()
@@ -348,9 +334,9 @@ class AdExtractor(WebScrapingMixin):
 
         # convert creation date to ISO format
         created_parts = creation_date.split(".")
-        creation_date = created_parts[2] + "-" + created_parts[1] + "-" + created_parts[0] + " 00:00:00"
-        creation_date = datetime.fromisoformat(creation_date).isoformat()
-        info["created_on"] = creation_date
+        creation_date_str = created_parts[2] + "-" + created_parts[1] + "-" + created_parts[0] + " 00:00:00"
+        creation_date_dt = datetime.fromisoformat(creation_date_str)
+        info["created_on"] = creation_date_dt
         info["updated_on"] = None  # will be set later on
 
         ad_cfg = AdPartial.model_validate(info)
@@ -360,6 +346,55 @@ class AdExtractor(WebScrapingMixin):
 
         return ad_cfg
 
+    async def _extract_ad_page_info_with_directory_handling(
+        self, relative_directory:str, ad_id:int
+    ) -> tuple[AdPartial, str]:
+        """
+        Extracts ad information and handles directory creation/renaming.
+
+        :param relative_directory: Base directory for downloads
+        :param ad_id: The ad ID
+        :return: AdPartial with directory information
+        """
+        # First, extract basic info to get the title
+        info:dict[str, Any] = {"active": True}
+
+        # extract basic info
+        info["type"] = "OFFER" if "s-anzeige" in self.page.url else "WANTED"
+        title = await self._extract_title_from_ad_page()
+        LOG.info('Extracting title from ad %s: "%s"', ad_id, title)
+
+        # Determine the final directory path
+        sanitized_title = misc.sanitize_folder_name(title, self.config.download.folder_name_max_length)
+        final_dir = os.path.join(relative_directory, f"ad_{ad_id}_{sanitized_title}")
+        temp_dir = os.path.join(relative_directory, f"ad_{ad_id}")
+
+        # Handle existing directories
+        if os.path.exists(final_dir):
+            # If the folder with title already exists, delete it
+            LOG.info("Deleting current folder of ad %s...", ad_id)
+            shutil.rmtree(final_dir)
+
+        if os.path.exists(temp_dir):
+            if self.config.download.rename_existing_folders:
+                # Rename the old folder to the new name with title
+                LOG.info("Renaming folder from %s to %s for ad %s...",
+                        os.path.basename(temp_dir), os.path.basename(final_dir), ad_id)
+                os.rename(temp_dir, final_dir)
+            else:
+                # Use the existing folder without renaming
+                final_dir = temp_dir
+                LOG.info("Using existing folder for ad %s at %s.", ad_id, final_dir)
+        else:
+            # Create new directory with title
+            os.mkdir(final_dir)
+            LOG.info("New directory for ad created at %s.", final_dir)
+
+        # Now extract complete ad info (including images) to the final directory
+        ad_cfg = await self._extract_ad_page_info(final_dir, ad_id)
+
+        return ad_cfg, final_dir
+
     async def _extract_category_from_ad_page(self) -> str:
         """
         Extracts a category of an ad in numerical form.
diff --git a/src/kleinanzeigen_bot/resources/translations.de.yaml b/src/kleinanzeigen_bot/resources/translations.de.yaml
index 5d0b241..25c31bc 100644
--- a/src/kleinanzeigen_bot/resources/translations.de.yaml
+++ b/src/kleinanzeigen_bot/resources/translations.de.yaml
@@ -174,10 +174,6 @@ kleinanzeigen_bot/extract.py:
 #################################################
   download_ad:
     "Created ads directory at ./%s.": "Verzeichnis für Anzeigen erstellt unter ./%s."
-    "Deleting current folder of ad %s...": "Lösche aktuellen Ordner der Anzeige %s..."
-    "New directory for ad created at %s.": "Neues Verzeichnis für Anzeige erstellt unter %s."
-    "Renaming folder from %s to %s for ad %s...": "Benenne Ordner von %s zu %s für Anzeige %s um..."
-    "Using existing folder for ad %s at %s.": "Verwende bestehenden Ordner für Anzeige %s unter %s."
 
   _download_images_from_ad_page:
     "Found %s.": "%s gefunden."
@@ -210,8 +206,12 @@ kleinanzeigen_bot/extract.py:
     "There is no ad under the given ID.": "Es gibt keine Anzeige unter der angegebenen ID."
     "A popup appeared!": "Ein Popup ist erschienen!"
 
-  _extract_ad_page_info:
-    "Extracting information from ad with title \"%s\"": "Extrahiere Informationen aus Anzeige mit Titel \"%s\""
+  _extract_ad_page_info_with_directory_handling:
+    "Extracting title from ad %s: \"%s\"": "Extrahiere Titel aus Anzeige %s: \"%s\""
+    "Deleting current folder of ad %s...": "Lösche aktuellen Ordner der Anzeige %s..."
+    "New directory for ad created at %s.": "Neues Verzeichnis für Anzeige erstellt unter %s."
+    "Renaming folder from %s to %s for ad %s...": "Benenne Ordner von %s zu %s für Anzeige %s um..."
+    "Using existing folder for ad %s at %s.": "Verwende bestehenden Ordner für Anzeige %s unter %s."
 
   _extract_contact_from_ad_page:
     "No street given in the contact.": "Keine Straße in den Kontaktdaten angegeben."
diff --git a/tests/unit/test_extract.py b/tests/unit/test_extract.py
index 56e2b28..f6735a2 100644
--- a/tests/unit/test_extract.py
+++ b/tests/unit/test_extract.py
@@ -735,7 +735,7 @@ class TestAdExtractorDownload:
                 patch("os.rename") as mock_rename, \
                 patch("shutil.rmtree") as mock_rmtree, \
                 patch("kleinanzeigen_bot.extract.dicts.save_dict", autospec = True) as mock_save_dict, \
-                patch.object(extractor, "_extract_ad_page_info", new_callable = AsyncMock) as mock_extract:
+                patch.object(extractor, "_extract_ad_page_info_with_directory_handling", new_callable = AsyncMock) as mock_extract_with_dir:
 
             base_dir = "downloaded-ads"
             final_dir = os.path.join(base_dir, "ad_12345_Test Advertisement Title")
@@ -746,26 +746,32 @@ class TestAdExtractorDownload:
             mock_exists.side_effect = lambda path: path in existing_paths
             mock_isdir.side_effect = lambda path: path == base_dir
 
-            mock_extract.return_value = AdPartial.model_validate({
-                "title": "Test Advertisement Title",
-                "description": "Test Description",
-                "category": "Dienstleistungen",
-                "price": 100,
-                "images": [],
-                "contact": {
-                    "name": "Test User",
-                    "street": "Test Street 123",
-                    "zipcode": "12345",
-                    "location": "Test City"
-                }
-            })
+            # Mock the new method that handles directory creation and extraction
+            mock_extract_with_dir.return_value = (
+                AdPartial.model_validate({
+                    "title": "Test Advertisement Title",
+                    "description": "Test Description",
+                    "category": "Dienstleistungen",
+                    "price": 100,
+                    "images": [],
+                    "contact": {
+                        "name": "Test User",
+                        "street": "Test Street 123",
+                        "zipcode": "12345",
+                        "location": "Test City"
+                    }
+                }),
+                final_dir
+            )
 
             await extractor.download_ad(12345)
 
             # Verify the correct functions were called
-            mock_extract.assert_called_once()
-            mock_rmtree.assert_called_once_with(final_dir)  # Delete the final directory with title
-            mock_mkdir.assert_called_once_with(final_dir)  # Create the final directory with title
+            mock_extract_with_dir.assert_called_once()
+            # Directory handling is now done inside _extract_ad_page_info_with_directory_handling
+            # so we don't expect rmtree/mkdir to be called directly in download_ad
+            mock_rmtree.assert_not_called()  # Directory handling is done internally
+            mock_mkdir.assert_not_called()  # Directory handling is done internally
             mock_makedirs.assert_not_called()  # Directory already exists
             mock_rename.assert_not_called()  # No renaming needed
 
@@ -774,15 +780,7 @@ class TestAdExtractorDownload:
             assert actual_call is not None
             actual_path = actual_call[0][0].replace("/", os.path.sep)
             assert actual_path == yaml_path
-            assert actual_call[0][1] == mock_extract.return_value.model_dump()
-
-    @pytest.mark.asyncio
-    # pylint: disable=protected-access
-    async def test_download_images_no_images(self, extractor:AdExtractor) -> None:
-        """Test image download when no images are found."""
-        with patch.object(extractor, "web_find", new_callable = AsyncMock, side_effect = TimeoutError):
-            image_paths = await extractor._download_images_from_ad_page("/some/dir", 12345)
-            assert len(image_paths) == 0
+            assert actual_call[0][1] == mock_extract_with_dir.return_value[0].model_dump()
 
     @pytest.mark.asyncio
     async def test_download_ad(self, extractor:AdExtractor) -> None:
@@ -794,7 +792,7 @@ class TestAdExtractorDownload:
                 patch("os.rename") as mock_rename, \
                 patch("shutil.rmtree") as mock_rmtree, \
                 patch("kleinanzeigen_bot.extract.dicts.save_dict", autospec = True) as mock_save_dict, \
-                patch.object(extractor, "_extract_ad_page_info", new_callable = AsyncMock) as mock_extract:
+                patch.object(extractor, "_extract_ad_page_info_with_directory_handling", new_callable = AsyncMock) as mock_extract_with_dir:
 
             base_dir = "downloaded-ads"
             final_dir = os.path.join(base_dir, "ad_12345_Test Advertisement Title")
@@ -804,29 +802,31 @@ class TestAdExtractorDownload:
             mock_exists.return_value = False
             mock_isdir.return_value = False
 
-            mock_extract.return_value = AdPartial.model_validate({
-                "title": "Test Advertisement Title",
-                "description": "Test Description",
-                "category": "Dienstleistungen",
-                "price": 100,
-                "images": [],
-                "contact": {
-                    "name": "Test User",
-                    "street": "Test Street 123",
-                    "zipcode": "12345",
-                    "location": "Test City"
-                }
-            })
+            # Mock the new method that handles directory creation and extraction
+            mock_extract_with_dir.return_value = (
+                AdPartial.model_validate({
+                    "title": "Test Advertisement Title",
+                    "description": "Test Description",
+                    "category": "Dienstleistungen",
+                    "price": 100,
+                    "images": [],
+                    "contact": {
+                        "name": "Test User",
+                        "street": "Test Street 123",
+                        "zipcode": "12345",
+                        "location": "Test City"
+                    }
+                }),
+                final_dir
+            )
 
             await extractor.download_ad(12345)
 
             # Verify the correct functions were called
-            mock_extract.assert_called_once()
-            mock_rmtree.assert_not_called()  # No directory to remove
-            mock_mkdir.assert_has_calls([
-                call(base_dir),
-                call(final_dir)  # Create the final directory with title
-            ])
+            mock_extract_with_dir.assert_called_once()
+            # Directory handling is now done inside _extract_ad_page_info_with_directory_handling
+            mock_rmtree.assert_not_called()  # Directory handling is done internally
+            mock_mkdir.assert_has_calls([call(base_dir)])  # Only base directory creation
             mock_makedirs.assert_not_called()  # Using mkdir instead
             mock_rename.assert_not_called()  # No renaming needed
 
@@ -835,7 +835,7 @@ class TestAdExtractorDownload:
             assert actual_call is not None
             actual_path = actual_call[0][0].replace("/", os.path.sep)
             assert actual_path == yaml_path
-            assert actual_call[0][1] == mock_extract.return_value.model_dump()
+            assert actual_call[0][1] == mock_extract_with_dir.return_value[0].model_dump()
 
     @pytest.mark.asyncio
     async def test_download_ad_use_existing_folder(self, extractor:AdExtractor) -> None:
@@ -847,7 +847,7 @@ class TestAdExtractorDownload:
                 patch("os.rename") as mock_rename, \
                 patch("shutil.rmtree") as mock_rmtree, \
                 patch("kleinanzeigen_bot.extract.dicts.save_dict", autospec = True) as mock_save_dict, \
-                patch.object(extractor, "_extract_ad_page_info", new_callable = AsyncMock) as mock_extract:
+                patch.object(extractor, "_extract_ad_page_info_with_directory_handling", new_callable = AsyncMock) as mock_extract_with_dir:
 
             base_dir = "downloaded-ads"
             temp_dir = os.path.join(base_dir, "ad_12345")
@@ -859,24 +859,28 @@ class TestAdExtractorDownload:
             mock_exists.side_effect = lambda path: path in existing_paths
             mock_isdir.side_effect = lambda path: path == base_dir
 
-            mock_extract.return_value = AdPartial.model_validate({
-                "title": "Test Advertisement Title",
-                "description": "Test Description",
-                "category": "Dienstleistungen",
-                "price": 100,
-                "images": [],
-                "contact": {
-                    "name": "Test User",
-                    "street": "Test Street 123",
-                    "zipcode": "12345",
-                    "location": "Test City"
-                }
-            })
+            # Mock the new method that handles directory creation and extraction
+            mock_extract_with_dir.return_value = (
+                AdPartial.model_validate({
+                    "title": "Test Advertisement Title",
+                    "description": "Test Description",
+                    "category": "Dienstleistungen",
+                    "price": 100,
+                    "images": [],
+                    "contact": {
+                        "name": "Test User",
+                        "street": "Test Street 123",
+                        "zipcode": "12345",
+                        "location": "Test City"
+                    }
+                }),
+                temp_dir  # Use existing temp directory
+            )
 
             await extractor.download_ad(12345)
 
             # Verify the correct functions were called
-            mock_extract.assert_called_once()
+            mock_extract_with_dir.assert_called_once()
             mock_rmtree.assert_not_called()  # No directory to remove
             mock_mkdir.assert_not_called()  # Base directory already exists
             mock_makedirs.assert_not_called()  # Using mkdir instead
@@ -887,7 +891,7 @@ class TestAdExtractorDownload:
             assert actual_call is not None
             actual_path = actual_call[0][0].replace("/", os.path.sep)
             assert actual_path == yaml_path
-            assert actual_call[0][1] == mock_extract.return_value.model_dump()
+            assert actual_call[0][1] == mock_extract_with_dir.return_value[0].model_dump()
 
     @pytest.mark.asyncio
     async def test_download_ad_rename_existing_folder_when_enabled(self, extractor:AdExtractor) -> None:
@@ -902,7 +906,7 @@ class TestAdExtractorDownload:
                 patch("os.rename") as mock_rename, \
                 patch("shutil.rmtree") as mock_rmtree, \
                 patch("kleinanzeigen_bot.extract.dicts.save_dict", autospec = True) as mock_save_dict, \
-                patch.object(extractor, "_extract_ad_page_info", new_callable = AsyncMock) as mock_extract:
+                patch.object(extractor, "_extract_ad_page_info_with_directory_handling", new_callable = AsyncMock) as mock_extract_with_dir:
 
             base_dir = "downloaded-ads"
             temp_dir = os.path.join(base_dir, "ad_12345")
@@ -915,32 +919,45 @@ class TestAdExtractorDownload:
             mock_exists.side_effect = lambda path: path in existing_paths
             mock_isdir.side_effect = lambda path: path == base_dir
 
-            mock_extract.return_value = AdPartial.model_validate({
-                "title": "Test Advertisement Title",
-                "description": "Test Description",
-                "category": "Dienstleistungen",
-                "price": 100,
-                "images": [],
-                "contact": {
-                    "name": "Test User",
-                    "street": "Test Street 123",
-                    "zipcode": "12345",
-                    "location": "Test City"
-                }
-            })
+            # Mock the new method that handles directory creation and extraction
+            mock_extract_with_dir.return_value = (
+                AdPartial.model_validate({
+                    "title": "Test Advertisement Title",
+                    "description": "Test Description",
+                    "category": "Dienstleistungen",
+                    "price": 100,
+                    "images": [],
+                    "contact": {
+                        "name": "Test User",
+                        "street": "Test Street 123",
+                        "zipcode": "12345",
+                        "location": "Test City"
+                    }
+                }),
+                final_dir
+            )
 
             await extractor.download_ad(12345)
 
             # Verify the correct functions were called
-            mock_extract.assert_called_once()
-            mock_rmtree.assert_not_called()  # No directory to remove
-            mock_mkdir.assert_not_called()  # Base directory already exists
+            mock_extract_with_dir.assert_called_once()  # Extract to final directory
+            # Directory handling (including renaming) is now done inside _extract_ad_page_info_with_directory_handling
+            mock_rmtree.assert_not_called()  # Directory handling is done internally
+            mock_mkdir.assert_not_called()  # Directory handling is done internally
             mock_makedirs.assert_not_called()  # Using mkdir instead
-            mock_rename.assert_called_once_with(temp_dir, final_dir)  # Rename temp to final
+            mock_rename.assert_not_called()  # Directory handling is done internally
 
             # Get the actual call arguments
             actual_call = mock_save_dict.call_args
             assert actual_call is not None
             actual_path = actual_call[0][0].replace("/", os.path.sep)
             assert actual_path == yaml_path
-            assert actual_call[0][1] == mock_extract.return_value.model_dump()
+            assert actual_call[0][1] == mock_extract_with_dir.return_value[0].model_dump()
+
+    @pytest.mark.asyncio
+    # pylint: disable=protected-access
+    async def test_download_images_no_images(self, extractor:AdExtractor) -> None:
+        """Test image download when no images are found."""
+        with patch.object(extractor, "web_find", new_callable = AsyncMock, side_effect = TimeoutError):
+            image_paths = await extractor._download_images_from_ad_page("/some/dir", 12345)
+            assert len(image_paths) == 0