Refactored image extraction. Images are now read directly from the galleryimage elements instead of stepping through the carousel.
This commit is contained in:
Heavenfighter
2025-05-25 22:28:20 +02:00
committed by GitHub
parent b17b19db24
commit 347c67a388
2 changed files with 4 additions and 15 deletions

View File

@@ -75,15 +75,15 @@ class AdExtractor(WebScrapingMixin):
# download all images from box # download all images from box
image_box = await self.web_find(By.CLASS_NAME, "galleryimage-large") image_box = await self.web_find(By.CLASS_NAME, "galleryimage-large")
n_images = len(await self.web_find_all(By.CSS_SELECTOR, ".galleryimage-element[data-ix]", parent = image_box)) images = await self.web_find_all(By.CSS_SELECTOR, ".galleryimage-element[data-ix] > img", parent = image_box)
n_images = len(images)
LOG.info("Found %s.", i18n.pluralize("image", n_images)) LOG.info("Found %s.", i18n.pluralize("image", n_images))
img_element:Element = await self.web_find(By.CSS_SELECTOR, "div:nth-child(1) > img", parent = image_box)
img_fn_prefix = "ad_" + str(ad_id) + "__img" img_fn_prefix = "ad_" + str(ad_id) + "__img"
img_nr = 1 img_nr = 1
dl_counter = 0 dl_counter = 0
while img_nr <= n_images: # scrolling + downloading
for img_element in images:
current_img_url = img_element.attrs["src"] # URL of the image current_img_url = img_element.attrs["src"] # URL of the image
if current_img_url is None: if current_img_url is None:
continue continue
@@ -97,16 +97,6 @@ class AdExtractor(WebScrapingMixin):
dl_counter += 1 dl_counter += 1
img_paths.append(img_path.rsplit("/", maxsplit = 1)[-1]) img_paths.append(img_path.rsplit("/", maxsplit = 1)[-1])
# navigate to next image (if exists)
if img_nr < n_images:
try:
# click next button, wait, and re-establish reference
await (await self.web_find(By.CLASS_NAME, "galleryimage--navigation--next")).click()
new_div = await self.web_find(By.CSS_SELECTOR, f"div.galleryimage-element:nth-child({img_nr + 1})")
img_element = await self.web_find(By.TAG_NAME, "img", parent = new_div)
except TimeoutError:
LOG.error("NEXT button in image gallery somehow missing, aborting image fetching.")
break
img_nr += 1 img_nr += 1
LOG.info("Downloaded %s.", i18n.pluralize("image", dl_counter)) LOG.info("Downloaded %s.", i18n.pluralize("image", dl_counter))

View File

@@ -165,7 +165,6 @@ kleinanzeigen_bot/extract.py:
_download_images_from_ad_page: _download_images_from_ad_page:
"Found %s.": "%s gefunden." "Found %s.": "%s gefunden."
"Downloaded %s.": "%s heruntergeladen." "Downloaded %s.": "%s heruntergeladen."
"NEXT button in image gallery somehow missing, aborting image fetching.": "WEITER-Button in der Bildergalerie fehlt, breche Bildabruf ab."
"No image area found. Continuing without downloading images.": "Keine Bildbereiche gefunden. Fahre ohne Bilder-Download fort." "No image area found. Continuing without downloading images.": "Keine Bildbereiche gefunden. Fahre ohne Bilder-Download fort."
extract_ad_id_from_ad_url: extract_ad_id_from_ad_url: