mirror of
https://github.com/Second-Hand-Friends/kleinanzeigen-bot.git
synced 2026-03-12 02:31:45 +01:00
fix: downloaded images have wrong file extension #348
This commit is contained in:
@@ -5,6 +5,7 @@ SPDX-ArtifactOfProjectHomePage: https://github.com/Second-Hand-Friends/kleinanze
|
|||||||
"""
|
"""
|
||||||
import logging, os, shutil
|
import logging, os, shutil
|
||||||
import urllib.request as urllib_request
|
import urllib.request as urllib_request
|
||||||
|
import mimetypes
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
from typing import Any, Final
|
from typing import Any, Final
|
||||||
|
|
||||||
@@ -77,12 +78,15 @@ class AdExtractor(WebScrapingMixin):
|
|||||||
current_img_url = img_element.attrs['src'] # URL of the image
|
current_img_url = img_element.attrs['src'] # URL of the image
|
||||||
if current_img_url is None:
|
if current_img_url is None:
|
||||||
continue
|
continue
|
||||||
file_ending = current_img_url.split('.')[-1].lower()
|
|
||||||
img_path = directory + '/' + img_fn_prefix + str(img_nr) + '.' + file_ending
|
with urllib_request.urlopen(current_img_url) as response: # nosec B310
|
||||||
if current_img_url.startswith('https'): # verify https (for Bandit linter)
|
content_type = response.info().get_content_type()
|
||||||
urllib_request.urlretrieve(current_img_url, img_path) # nosec B310
|
file_ending = mimetypes.guess_extension(content_type)
|
||||||
dl_counter += 1
|
img_path = f"{directory}/{img_fn_prefix}{img_nr}{file_ending}"
|
||||||
img_paths.append(img_path.split('/')[-1])
|
with open(img_path, 'wb') as f:
|
||||||
|
shutil.copyfileobj(response, f)
|
||||||
|
dl_counter += 1
|
||||||
|
img_paths.append(img_path.rsplit('/', maxsplit = 1)[-1])
|
||||||
|
|
||||||
# navigate to next image (if exists)
|
# navigate to next image (if exists)
|
||||||
if img_nr < n_images:
|
if img_nr < n_images:
|
||||||
|
|||||||
Reference in New Issue
Block a user