From df24a675a92e150190296142d8690ce4afbdfe6e Mon Sep 17 00:00:00 2001 From: Jens Bergmann <1742418+1cu@users.noreply.github.com> Date: Sun, 17 Aug 2025 17:49:00 +0200 Subject: [PATCH] fix: resolve #612 FileNotFoundError and improve ad download architecture (#613) --- src/kleinanzeigen_bot/extract.py | 113 +++++++---- .../resources/translations.de.yaml | 12 +- tests/unit/test_extract.py | 181 ++++++++++-------- 3 files changed, 179 insertions(+), 127 deletions(-) diff --git a/src/kleinanzeigen_bot/extract.py b/src/kleinanzeigen_bot/extract.py index 8796cd3..780c4cf 100644 --- a/src/kleinanzeigen_bot/extract.py +++ b/src/kleinanzeigen_bot/extract.py @@ -45,36 +45,13 @@ class AdExtractor(WebScrapingMixin): os.mkdir(relative_directory) LOG.info("Created ads directory at ./%s.", relative_directory) - # First, extract ad info to get the title - temp_dir = os.path.join(relative_directory, f"ad_{ad_id}") - ad_cfg:AdPartial = await self._extract_ad_page_info(temp_dir, ad_id) - - # Create folder name with ad title - sanitized_title = misc.sanitize_folder_name(ad_cfg.title, self.config.download.folder_name_max_length) - new_base_dir = os.path.join(relative_directory, f"ad_{ad_id}_{sanitized_title}") - - # If the folder with title already exists, delete it - if os.path.exists(new_base_dir): - LOG.info("Deleting current folder of ad %s...", ad_id) - shutil.rmtree(new_base_dir) - - # If the old folder without title exists, handle based on configuration - if os.path.exists(temp_dir): - if self.config.download.rename_existing_folders: - LOG.info("Renaming folder from %s to %s for ad %s...", - os.path.basename(temp_dir), os.path.basename(new_base_dir), ad_id) - os.rename(temp_dir, new_base_dir) - else: - # Use the existing folder without renaming - new_base_dir = temp_dir - LOG.info("Using existing folder for ad %s at %s.", ad_id, new_base_dir) - else: - # Create new directory with title - os.mkdir(new_base_dir) - LOG.info("New directory for ad created at %s.", new_base_dir) + # Extract ad info and determine final directory path + ad_cfg, final_dir = await self._extract_ad_page_info_with_directory_handling( + relative_directory, ad_id + ) # Save the ad configuration file - ad_file_path = new_base_dir + "/" + f"ad_{ad_id}.yaml" + ad_file_path = final_dir + "/" + f"ad_{ad_id}.yaml" dicts.save_dict( ad_file_path, ad_cfg.model_dump(), @@ -283,23 +260,32 @@ class AdExtractor(WebScrapingMixin): pass return True + async def _extract_title_from_ad_page(self) -> str: + """ + Extracts the title from an ad page. + Assumes that the web driver currently shows an ad page. + + :return: the ad title + """ + return await self.web_text(By.ID, "viewad-title") + async def _extract_ad_page_info(self, directory:str, ad_id:int) -> AdPartial: """ - Extracts all necessary information from an ad´s page. + Extracts ad information and downloads images to the specified directory. + NOTE: Requires that the driver session currently is on the ad page. - :param directory: the path of the ad´s previously created directory - :param ad_id: the ad ID, already extracted by a calling function + :param directory: the directory to download images to + :param ad_id: the ad ID + :return: an AdPartial object containing the ad information """ info:dict[str, Any] = {"active": True} # extract basic info info["type"] = "OFFER" if "s-anzeige" in self.page.url else "WANTED" - title:str = await self.web_text(By.ID, "viewad-title") - LOG.info('Extracting information from ad with title "%s"', title) - # contains info about ad in different dimensions in form of a key:value dict - # dimension108 contains special attributes - # dimension92 contains fake category, which becomes an special attribute on ad page + # Extract title + title = await self._extract_title_from_ad_page() + belen_conf = await self.web_execute("window.BelenConf") info["category"] = await self._extract_category_from_ad_page() @@ -348,9 +334,9 @@ class AdExtractor(WebScrapingMixin): # convert creation date to ISO format created_parts = creation_date.split(".") - creation_date = created_parts[2] + "-" + created_parts[1] + "-" + created_parts[0] + " 00:00:00" - creation_date = datetime.fromisoformat(creation_date).isoformat() - info["created_on"] = creation_date + creation_date_str = created_parts[2] + "-" + created_parts[1] + "-" + created_parts[0] + " 00:00:00" + creation_date_dt = datetime.fromisoformat(creation_date_str) + info["created_on"] = creation_date_dt info["updated_on"] = None # will be set later on ad_cfg = AdPartial.model_validate(info) @@ -360,6 +346,55 @@ class AdExtractor(WebScrapingMixin): return ad_cfg + async def _extract_ad_page_info_with_directory_handling( + self, relative_directory:str, ad_id:int + ) -> tuple[AdPartial, str]: + """ + Extracts ad information and handles directory creation/renaming. + + :param relative_directory: Base directory for downloads + :param ad_id: The ad ID + :return: AdPartial with directory information + """ + # First, extract basic info to get the title + info:dict[str, Any] = {"active": True} + + # extract basic info + info["type"] = "OFFER" if "s-anzeige" in self.page.url else "WANTED" + title = await self._extract_title_from_ad_page() + LOG.info('Extracting title from ad %s: "%s"', ad_id, title) + + # Determine the final directory path + sanitized_title = misc.sanitize_folder_name(title, self.config.download.folder_name_max_length) + final_dir = os.path.join(relative_directory, f"ad_{ad_id}_{sanitized_title}") + temp_dir = os.path.join(relative_directory, f"ad_{ad_id}") + + # Handle existing directories + if os.path.exists(final_dir): + # If the folder with title already exists, delete it + LOG.info("Deleting current folder of ad %s...", ad_id) + shutil.rmtree(final_dir) + + if os.path.exists(temp_dir): + if self.config.download.rename_existing_folders: + # Rename the old folder to the new name with title + LOG.info("Renaming folder from %s to %s for ad %s...", + os.path.basename(temp_dir), os.path.basename(final_dir), ad_id) + os.rename(temp_dir, final_dir) + else: + # Use the existing folder without renaming + final_dir = temp_dir + LOG.info("Using existing folder for ad %s at %s.", ad_id, final_dir) + else: + # Create new directory with title + os.mkdir(final_dir) + LOG.info("New directory for ad created at %s.", final_dir) + + # Now extract complete ad info (including images) to the final directory + ad_cfg = await self._extract_ad_page_info(final_dir, ad_id) + + return ad_cfg, final_dir + async def _extract_category_from_ad_page(self) -> str: """ Extracts a category of an ad in numerical form. diff --git a/src/kleinanzeigen_bot/resources/translations.de.yaml b/src/kleinanzeigen_bot/resources/translations.de.yaml index 5d0b241..25c31bc 100644 --- a/src/kleinanzeigen_bot/resources/translations.de.yaml +++ b/src/kleinanzeigen_bot/resources/translations.de.yaml @@ -174,10 +174,6 @@ kleinanzeigen_bot/extract.py: ################################################# download_ad: "Created ads directory at ./%s.": "Verzeichnis für Anzeigen erstellt unter ./%s." - "Deleting current folder of ad %s...": "Lösche aktuellen Ordner der Anzeige %s..." - "New directory for ad created at %s.": "Neues Verzeichnis für Anzeige erstellt unter %s." - "Renaming folder from %s to %s for ad %s...": "Benenne Ordner von %s zu %s für Anzeige %s um..." - "Using existing folder for ad %s at %s.": "Verwende bestehenden Ordner für Anzeige %s unter %s." _download_images_from_ad_page: "Found %s.": "%s gefunden." @@ -210,8 +206,12 @@ kleinanzeigen_bot/extract.py: "There is no ad under the given ID.": "Es gibt keine Anzeige unter der angegebenen ID." "A popup appeared!": "Ein Popup ist erschienen!" - _extract_ad_page_info: - "Extracting information from ad with title \"%s\"": "Extrahiere Informationen aus Anzeige mit Titel \"%s\"" + _extract_ad_page_info_with_directory_handling: + "Extracting title from ad %s: \"%s\"": "Extrahiere Titel aus Anzeige %s: \"%s\"" + "Deleting current folder of ad %s...": "Lösche aktuellen Ordner der Anzeige %s..." + "New directory for ad created at %s.": "Neues Verzeichnis für Anzeige erstellt unter %s." + "Renaming folder from %s to %s for ad %s...": "Benenne Ordner von %s zu %s für Anzeige %s um..." + "Using existing folder for ad %s at %s.": "Verwende bestehenden Ordner für Anzeige %s unter %s." _extract_contact_from_ad_page: "No street given in the contact.": "Keine Straße in den Kontaktdaten angegeben." diff --git a/tests/unit/test_extract.py b/tests/unit/test_extract.py index 56e2b28..f6735a2 100644 --- a/tests/unit/test_extract.py +++ b/tests/unit/test_extract.py @@ -735,7 +735,7 @@ class TestAdExtractorDownload: patch("os.rename") as mock_rename, \ patch("shutil.rmtree") as mock_rmtree, \ patch("kleinanzeigen_bot.extract.dicts.save_dict", autospec = True) as mock_save_dict, \ - patch.object(extractor, "_extract_ad_page_info", new_callable = AsyncMock) as mock_extract: + patch.object(extractor, "_extract_ad_page_info_with_directory_handling", new_callable = AsyncMock) as mock_extract_with_dir: base_dir = "downloaded-ads" final_dir = os.path.join(base_dir, "ad_12345_Test Advertisement Title") @@ -746,26 +746,32 @@ class TestAdExtractorDownload: mock_exists.side_effect = lambda path: path in existing_paths mock_isdir.side_effect = lambda path: path == base_dir - mock_extract.return_value = AdPartial.model_validate({ - "title": "Test Advertisement Title", - "description": "Test Description", - "category": "Dienstleistungen", - "price": 100, - "images": [], - "contact": { - "name": "Test User", - "street": "Test Street 123", - "zipcode": "12345", - "location": "Test City" - } - }) + # Mock the new method that handles directory creation and extraction + mock_extract_with_dir.return_value = ( + AdPartial.model_validate({ + "title": "Test Advertisement Title", + "description": "Test Description", + "category": "Dienstleistungen", + "price": 100, + "images": [], + "contact": { + "name": "Test User", + "street": "Test Street 123", + "zipcode": "12345", + "location": "Test City" + } + }), + final_dir + ) await extractor.download_ad(12345) # Verify the correct functions were called - mock_extract.assert_called_once() - mock_rmtree.assert_called_once_with(final_dir) # Delete the final directory with title - mock_mkdir.assert_called_once_with(final_dir) # Create the final directory with title + mock_extract_with_dir.assert_called_once() + # Directory handling is now done inside _extract_ad_page_info_with_directory_handling + # so we don't expect rmtree/mkdir to be called directly in download_ad + mock_rmtree.assert_not_called() # Directory handling is done internally + mock_mkdir.assert_not_called() # Directory handling is done internally mock_makedirs.assert_not_called() # Directory already exists mock_rename.assert_not_called() # No renaming needed @@ -774,15 +780,7 @@ class TestAdExtractorDownload: assert actual_call is not None actual_path = actual_call[0][0].replace("/", os.path.sep) assert actual_path == yaml_path - assert actual_call[0][1] == mock_extract.return_value.model_dump() - - @pytest.mark.asyncio - # pylint: disable=protected-access - async def test_download_images_no_images(self, extractor:AdExtractor) -> None: - """Test image download when no images are found.""" - with patch.object(extractor, "web_find", new_callable = AsyncMock, side_effect = TimeoutError): - image_paths = await extractor._download_images_from_ad_page("/some/dir", 12345) - assert len(image_paths) == 0 + assert actual_call[0][1] == mock_extract_with_dir.return_value[0].model_dump() @pytest.mark.asyncio async def test_download_ad(self, extractor:AdExtractor) -> None: @@ -794,7 +792,7 @@ class TestAdExtractorDownload: patch("os.rename") as mock_rename, \ patch("shutil.rmtree") as mock_rmtree, \ patch("kleinanzeigen_bot.extract.dicts.save_dict", autospec = True) as mock_save_dict, \ - patch.object(extractor, "_extract_ad_page_info", new_callable = AsyncMock) as mock_extract: + patch.object(extractor, "_extract_ad_page_info_with_directory_handling", new_callable = AsyncMock) as mock_extract_with_dir: base_dir = "downloaded-ads" final_dir = os.path.join(base_dir, "ad_12345_Test Advertisement Title") @@ -804,29 +802,31 @@ class TestAdExtractorDownload: mock_exists.return_value = False mock_isdir.return_value = False - mock_extract.return_value = AdPartial.model_validate({ - "title": "Test Advertisement Title", - "description": "Test Description", - "category": "Dienstleistungen", - "price": 100, - "images": [], - "contact": { - "name": "Test User", - "street": "Test Street 123", - "zipcode": "12345", - "location": "Test City" - } - }) + # Mock the new method that handles directory creation and extraction + mock_extract_with_dir.return_value = ( + AdPartial.model_validate({ + "title": "Test Advertisement Title", + "description": "Test Description", + "category": "Dienstleistungen", + "price": 100, + "images": [], + "contact": { + "name": "Test User", + "street": "Test Street 123", + "zipcode": "12345", + "location": "Test City" + } + }), + final_dir + ) await extractor.download_ad(12345) # Verify the correct functions were called - mock_extract.assert_called_once() - mock_rmtree.assert_not_called() # No directory to remove - mock_mkdir.assert_has_calls([ - call(base_dir), - call(final_dir) # Create the final directory with title - ]) + mock_extract_with_dir.assert_called_once() + # Directory handling is now done inside _extract_ad_page_info_with_directory_handling + mock_rmtree.assert_not_called() # Directory handling is done internally + mock_mkdir.assert_has_calls([call(base_dir)]) # Only base directory creation mock_makedirs.assert_not_called() # Using mkdir instead mock_rename.assert_not_called() # No renaming needed @@ -835,7 +835,7 @@ class TestAdExtractorDownload: assert actual_call is not None actual_path = actual_call[0][0].replace("/", os.path.sep) assert actual_path == yaml_path - assert actual_call[0][1] == mock_extract.return_value.model_dump() + assert actual_call[0][1] == mock_extract_with_dir.return_value[0].model_dump() @pytest.mark.asyncio async def test_download_ad_use_existing_folder(self, extractor:AdExtractor) -> None: @@ -847,7 +847,7 @@ class TestAdExtractorDownload: patch("os.rename") as mock_rename, \ patch("shutil.rmtree") as mock_rmtree, \ patch("kleinanzeigen_bot.extract.dicts.save_dict", autospec = True) as mock_save_dict, \ - patch.object(extractor, "_extract_ad_page_info", new_callable = AsyncMock) as mock_extract: + patch.object(extractor, "_extract_ad_page_info_with_directory_handling", new_callable = AsyncMock) as mock_extract_with_dir: base_dir = "downloaded-ads" temp_dir = os.path.join(base_dir, "ad_12345") @@ -859,24 +859,28 @@ class TestAdExtractorDownload: mock_exists.side_effect = lambda path: path in existing_paths mock_isdir.side_effect = lambda path: path == base_dir - mock_extract.return_value = AdPartial.model_validate({ - "title": "Test Advertisement Title", - "description": "Test Description", - "category": "Dienstleistungen", - "price": 100, - "images": [], - "contact": { - "name": "Test User", - "street": "Test Street 123", - "zipcode": "12345", - "location": "Test City" - } - }) + # Mock the new method that handles directory creation and extraction + mock_extract_with_dir.return_value = ( + AdPartial.model_validate({ + "title": "Test Advertisement Title", + "description": "Test Description", + "category": "Dienstleistungen", + "price": 100, + "images": [], + "contact": { + "name": "Test User", + "street": "Test Street 123", + "zipcode": "12345", + "location": "Test City" + } + }), + temp_dir # Use existing temp directory + ) await extractor.download_ad(12345) # Verify the correct functions were called - mock_extract.assert_called_once() + mock_extract_with_dir.assert_called_once() mock_rmtree.assert_not_called() # No directory to remove mock_mkdir.assert_not_called() # Base directory already exists mock_makedirs.assert_not_called() # Using mkdir instead @@ -887,7 +891,7 @@ class TestAdExtractorDownload: assert actual_call is not None actual_path = actual_call[0][0].replace("/", os.path.sep) assert actual_path == yaml_path - assert actual_call[0][1] == mock_extract.return_value.model_dump() + assert actual_call[0][1] == mock_extract_with_dir.return_value[0].model_dump() @pytest.mark.asyncio async def test_download_ad_rename_existing_folder_when_enabled(self, extractor:AdExtractor) -> None: @@ -902,7 +906,7 @@ class TestAdExtractorDownload: patch("os.rename") as mock_rename, \ patch("shutil.rmtree") as mock_rmtree, \ patch("kleinanzeigen_bot.extract.dicts.save_dict", autospec = True) as mock_save_dict, \ - patch.object(extractor, "_extract_ad_page_info", new_callable = AsyncMock) as mock_extract: + patch.object(extractor, "_extract_ad_page_info_with_directory_handling", new_callable = AsyncMock) as mock_extract_with_dir: base_dir = "downloaded-ads" temp_dir = os.path.join(base_dir, "ad_12345") @@ -915,32 +919,45 @@ class TestAdExtractorDownload: mock_exists.side_effect = lambda path: path in existing_paths mock_isdir.side_effect = lambda path: path == base_dir - mock_extract.return_value = AdPartial.model_validate({ - "title": "Test Advertisement Title", - "description": "Test Description", - "category": "Dienstleistungen", - "price": 100, - "images": [], - "contact": { - "name": "Test User", - "street": "Test Street 123", - "zipcode": "12345", - "location": "Test City" - } - }) + # Mock the new method that handles directory creation and extraction + mock_extract_with_dir.return_value = ( + AdPartial.model_validate({ + "title": "Test Advertisement Title", + "description": "Test Description", + "category": "Dienstleistungen", + "price": 100, + "images": [], + "contact": { + "name": "Test User", + "street": "Test Street 123", + "zipcode": "12345", + "location": "Test City" + } + }), + final_dir + ) await extractor.download_ad(12345) # Verify the correct functions were called - mock_extract.assert_called_once() - mock_rmtree.assert_not_called() # No directory to remove - mock_mkdir.assert_not_called() # Base directory already exists + mock_extract_with_dir.assert_called_once() # Extract to final directory + # Directory handling (including renaming) is now done inside _extract_ad_page_info_with_directory_handling + mock_rmtree.assert_not_called() # Directory handling is done internally + mock_mkdir.assert_not_called() # Directory handling is done internally mock_makedirs.assert_not_called() # Using mkdir instead - mock_rename.assert_called_once_with(temp_dir, final_dir) # Rename temp to final + mock_rename.assert_not_called() # Directory handling is done internally # Get the actual call arguments actual_call = mock_save_dict.call_args assert actual_call is not None actual_path = actual_call[0][0].replace("/", os.path.sep) assert actual_path == yaml_path - assert actual_call[0][1] == mock_extract.return_value.model_dump() + assert actual_call[0][1] == mock_extract_with_dir.return_value[0].model_dump() + + @pytest.mark.asyncio + # pylint: disable=protected-access + async def test_download_images_no_images(self, extractor:AdExtractor) -> None: + """Test image download when no images are found.""" + with patch.object(extractor, "web_find", new_callable = AsyncMock, side_effect = TimeoutError): + image_paths = await extractor._download_images_from_ad_page("/some/dir", 12345) + assert len(image_paths) == 0