ADD download --all feature (#139)

2026-03-12 10:31:50 +01:00 · 2022-11-22 18:52:32 +01:00
parent 23682edb5c
commit 70a23dbcc7
4 changed files with 231 additions and 52 deletions
--- a/README.md
+++ b/README.md
@@ -179,20 +179,25 @@ Usage: kleinanzeigen-bot COMMAND [OPTIONS]
 Commands:
  publish  - (re-)publishes ads
  verify   - verifies the configuration files
-  download - downloads an ad
+  delete   - deletes ads
  download - downloads one or multiple ads
  --
-  help    - displays this help (default command)
+  help     - displays this help (default command)
-  version - displays the application version
+  version  - displays the application version
 Options:
-  --ads=all|due|new - specifies which ads to (re-)publish (DEFAULT: due)
+  --ads=all|due|new (publish) - specifies which ads to (re-)publish (DEFAULT: due)
        Possible values:
        * all: (re-)publish all ads ignoring republication_interval
        * due: publish all new ads and republish ads according to the republication_interval
        * new: only publish new ads (i.e. ads that have no id in the config file)
  --ads=all|new|<id(s)> (download) - specifies which ads to download (DEFAULT: new)
        Possible values:
        * all: downloads all ads from your profile
        * new: downloads ads from your profile that are not locally saved yet
        * <id(s)>: provide one or several ads by ID to download, like e.g. "--ads=1,2,3"
  --force           - alias for '--ads=all'
  --keep-old        - don't delete old ads on republication
  --ad <ID>         - provide the ad ID after this option when using the download command
  --config=<PATH>   - path to the config YAML or JSON file (DEFAULT: ./config.yaml)
  --logfile=<PATH>  - path to the logfile (DEFAULT: ./kleinanzeigen-bot.log)
  -v, --verbose     - enables verbose output - only useful when troubleshooting issues
--- a/kleinanzeigen_bot/init.py
+++ b/kleinanzeigen_bot/init.py
@@ -3,20 +3,22 @@ Copyright (C) 2022 Sebastian Thomschke and contributors
 SPDX-License-Identifier: AGPL-3.0-or-later
 """
 import atexit, copy, getopt, importlib.metadata, json, logging, os, signal, sys, textwrap, time, urllib
 import re
 import shutil
 from collections.abc import Iterable
 from datetime import datetime
 from logging.handlers import RotatingFileHandler
 from typing import Any, Final
 from urllib import request
 from wcmatch import glob
 from overrides import overrides
 from ruamel.yaml import YAML
-from selenium.common.exceptions import NoSuchElementException, TimeoutException, WebDriverException
+from selenium.common.exceptions import NoSuchElementException, TimeoutException, WebDriverException, \
    ElementClickInterceptedException
 from selenium.webdriver.common.by import By
 from selenium.webdriver.support import expected_conditions as EC
 from selenium.webdriver.support.wait import WebDriverWait
 from . import utils, resources, extract  # pylint: disable=W0406
 from .utils import abspath, apply_defaults, ensure, is_frozen, pause, pluralize, safe_get, parse_datetime
@@ -52,7 +54,6 @@ class KleinanzeigenBot(SeleniumMixin):
        self.ads_selector = "due"
        self.delete_old_ads = True
        self.delete_ads_by_title = False
        self.ad_id = None  # attribute needed when downloading an ad
    def __del__(self) -> None:
        if self.file_log:
@@ -99,24 +100,16 @@ class KleinanzeigenBot(SeleniumMixin):
                    LOG.info("############################################")
            case "download":
                self.configure_file_logging()
-                # ad ID passed as value to download command
+                # ad IDs depends on selector
-                if self.ad_id is None:
+                if not (self.ads_selector in {'all', 'new'} or re.compile(r'\d+[,\d+]*').search(self.ads_selector)):
-                    LOG.error('Provide the flag \'--ad\' with a valid ad ID to use the download command!')
+                    LOG.warning('You provided no ads selector. Defaulting to "new".')
-                    sys.exit(2)
+                    self.ads_selector = 'new'
-                if self.ad_id < 1:
+                # start session
                    LOG.error('The given ad ID must be valid!')
                    sys.exit(2)
                LOG.info('Start fetch task for ad with ID %s', str(self.ad_id))
                self.load_config()
                self.create_webdriver_session()
                self.login()
-                # call download function
+                self.start_download_routine()  # call correct version of download
-                exists = self.navigate_to_ad_page()
+
                if exists:
                    self.download_ad_page()
                else:
                    sys.exit(2)
            case _:
                LOG.error("Unknown command: %s", self.command)
                sys.exit(2)
@@ -136,20 +129,24 @@ class KleinanzeigenBot(SeleniumMixin):
              publish  - (re-)publishes ads
              verify   - verifies the configuration files
              delete   - deletes ads
-              download - downloads an ad
+              download - downloads one or multiple ads
              --
-              help    - displays this help (default command)
+              help     - displays this help (default command)
-              version - displays the application version
+              version  - displays the application version
            Options:
-              --ads=all|due|new - specifies which ads to (re-)publish (DEFAULT: due)
+              --ads=all|due|new (publish) - specifies which ads to (re-)publish (DEFAULT: due)
                    Possible values:
                    * all: (re-)publish all ads ignoring republication_interval
                    * due: publish all new ads and republish ads according the republication_interval
                    * new: only publish new ads (i.e. ads that have no id in the config file)
              --ads=all|new|<id(s)> (download) - specifies which ads to download (DEFAULT: new)
                    Possible values:
                    * all: downloads all ads from your profile
                    * new: downloads ads from your profile that are not locally saved yet
                    * <id(s)>: provide one or several ads by ID to download, like e.g. "--ads=1,2,3"
              --force           - alias for '--ads=all'
              --keep-old        - don't delete old ads on republication
              --ad <ID>         - provide the ad ID after this option when using the download command
              --config=<PATH>   - path to the config YAML or JSON file (DEFAULT: ./config.yaml)
              --logfile=<PATH>  - path to the logfile (DEFAULT: ./kleinanzeigen-bot.log)
              -v, --verbose     - enables verbose output - only useful when troubleshooting issues
@@ -163,7 +160,6 @@ class KleinanzeigenBot(SeleniumMixin):
                "force",
                "help",
                "keep-old",
                "ad=",
                "logfile=",
                "verbose"
            ])
@@ -190,12 +186,6 @@ class KleinanzeigenBot(SeleniumMixin):
                    self.ads_selector = "all"
                case "--keep-old":
                    self.delete_old_ads = False
                case "--ad":
                    try:
                        self.ad_id:int = int(value)
                    except ValueError:  # given value cannot be parsed as integer
                        LOG.error('The given ad ID (\"%s\") is not a valid number!', value)
                        sys.exit(2)
                case "-v" | "--verbose":
                    LOG.setLevel(logging.DEBUG)
@@ -663,16 +653,30 @@ class KleinanzeigenBot(SeleniumMixin):
                else:
                    raise TimeoutException("Loading page failed, it still shows fullscreen ad.") from ex
-    def navigate_to_ad_page(self) -> bool:
+    def navigate_to_ad_page(self, id_:int | None = None, url:str | None = None) -> bool:
        """
-        Navigates to an ad page specified with an ad ID.
+        Navigates to an ad page specified with an ad ID; or alternatively by a given URL.
        :param id_: if provided (and no url given), the ID is used to search for the ad to navigate to
        :param url: if given, this URL is used instead of an id to find the ad page
        :return: whether the navigation to the ad page was successful
        """
-        # enter the ad ID into the search bar
+        if not (id_ or url):
-        self.web_input(By.XPATH, '//*[@id="site-search-query"]', str(self.ad_id))
+            raise UserWarning('This function needs either the "id_" or "url" parameter given!')
-        # navigate to ad page and wait
+        if url:
-        self.web_click(By.XPATH, '//*[@id="site-search-submit"]')
+            self.webdriver.get(url)  # navigate to URL directly given
        else:
            # enter the ad ID into the search bar
            self.web_input(By.XPATH, '//*[@id="site-search-query"]', str(id_))
            # navigate to ad page and wait
            submit_button = self.webdriver.find_element(By.XPATH, '//*[@id="site-search-submit"]')
            WebDriverWait(self.webdriver, 15).until(EC.element_to_be_clickable(submit_button))
            try:
                submit_button.click()
            except ElementClickInterceptedException:  # sometimes: special banner might pop up and intercept
                LOG.warning('Waiting for unexpected element to close...')
                pause(6000, 10000)
                submit_button.click()
        pause(1000, 2000)
        # handle the case that invalid ad ID given
@@ -686,7 +690,7 @@ class KleinanzeigenBot(SeleniumMixin):
            close_button.click()
            time.sleep(1)
        except NoSuchElementException:
-            print('(no popup given)')
+            print('(no popup)')
        return True
    def download_images_from_ad_page(self, directory:str, ad_id:int, logger:logging.Logger) -> list[str]:
@@ -753,11 +757,12 @@ class KleinanzeigenBot(SeleniumMixin):
        return img_paths
-    def extract_ad_page_info(self, directory:str) -> dict:
+    def extract_ad_page_info(self, directory:str, id_:int) -> dict:
        """
        Extracts all necessary information from an ad´s page.
        :param directory: the path of the ad´s previously created directory
        :param id_: the ad ID, already extracted by a calling function
        :return: a dictionary with the keys as given in an ad YAML, and their respective values
        """
        info = {'active': True}
@@ -789,14 +794,15 @@ class KleinanzeigenBot(SeleniumMixin):
        info['shipping_type'], info['shipping_costs'] = extractor.extract_shipping_info_from_ad_page()
        # fetch images
-        info['images'] = self.download_images_from_ad_page(directory, self.ad_id, LOG)
+        info['images'] = self.download_images_from_ad_page(directory, id_, LOG)
        # process address
        info['contact'] = extractor.extract_contact_from_ad_page()
        # process meta info
        info['republication_interval'] = 7  # a default value for downloaded ads
-        info['id'] = self.ad_id
+        info['id'] = id_
        try:  # try different locations known for creation date element
            creation_date = self.webdriver.find_element(By.XPATH, '/html/body/div[1]/div[2]/div/section[2]/section/section/article/div[3]/div[2]/div[2]/'
                                                                  'div[1]/span').text
@@ -812,9 +818,12 @@ class KleinanzeigenBot(SeleniumMixin):
        return info
-    def download_ad_page(self):
+    def download_ad_page(self, id_:int):
        """
-        Downloads an ad to a specific location, specified by config and ad_id.
+        Downloads an ad to a specific location, specified by config and ad ID.
        NOTE: Requires that the driver session currently is on the ad page.
        :param id_: the ad ID
        """
        # create sub-directory for ad to download:
@@ -822,7 +831,8 @@ class KleinanzeigenBot(SeleniumMixin):
        # make sure configured base directory exists
        if not os.path.exists(relative_directory) or not os.path.isdir(relative_directory):
            os.mkdir(relative_directory)
-        new_base_dir = os.path.join(relative_directory, f'ad_{self.ad_id}')
+
        new_base_dir = os.path.join(relative_directory, f'ad_{id_}')
        if os.path.exists(new_base_dir):
            LOG.info('Deleting current folder of ad...')
            shutil.rmtree(new_base_dir)
@@ -830,10 +840,78 @@ class KleinanzeigenBot(SeleniumMixin):
        LOG.info('New directory for ad created at %s.', new_base_dir)
        # call extraction function
-        info = self.extract_ad_page_info(new_base_dir)
+        info = self.extract_ad_page_info(new_base_dir, id_)
-        ad_file_path = new_base_dir + '/' + f'ad_{self.ad_id}.yaml'
+        ad_file_path = new_base_dir + '/' + f'ad_{id_}.yaml'
        utils.save_dict(ad_file_path, info)
    def start_download_routine(self):
        """
        Determines which download mode was chosen with the arguments, and calls the specified download routine.
        This downloads either all, only unsaved (new), or specific ads given by ID.

        The mode is read from ``self.ads_selector``:
        * 'all': every ad found on the own ads overview is downloaded
        * 'new': only ads whose ID is not found in any local ad file are downloaded
        * '<id>[,<id>...]': the ads with the given numeric IDs are searched and downloaded
        Any other selector is reported as an error and nothing is downloaded.
        """
        if self.ads_selector in {'all', 'new'}:  # explore ads overview for these two modes
            LOG.info('Scanning your ad overview...')
            ext = extract.AdExtractor(self.webdriver)
            refs = ext.extract_own_ads_references()
            LOG.info('%d ads were found!', len(refs))

            only_new = self.ads_selector == 'new'
            saved_ad_ids: set[int] = set()
            if only_new:
                saved_ad_ids = self._collect_saved_ad_ids()
                LOG.info('Start fetch task for your unsaved ads!')
            else:
                LOG.info('Start fetch task for all your ads!')

            downloaded_count = 0
            for ref in refs:
                id_: int = utils.extract_ad_id_from_ad_link(ref)
                if id_ < 0:
                    # the helper signals an unparsable link with -1; downloading would create a bogus 'ad_-1' folder
                    LOG.error('Skipping ad, its ID could not be determined from the link: %s', ref)
                    continue
                if id_ in saved_ad_ids:
                    LOG.info('The ad with id %d has already been saved.', id_)
                    continue
                if self.navigate_to_ad_page(url=ref):
                    self.download_ad_page(id_)
                    downloaded_count += 1

            if only_new:
                LOG.info('%d new ads were downloaded from your profile.', downloaded_count)
            else:
                LOG.info("%d of %d ads were downloaded from your profile.", downloaded_count, len(refs))

        # the whole selector must be a comma separated list of numbers, not merely contain a digit somewhere
        elif re.fullmatch(r'\s*\d+\s*(?:,\s*\d+\s*)*', self.ads_selector):  # download ad(s) with specific id(s)
            ids = [int(n) for n in self.ads_selector.split(',')]
            LOG.info('Start fetch task for the ad(s) with the id(s):')
            LOG.info(' | '.join([str(id_) for id_ in ids]))

            for id_ in ids:  # call download routine for every id
                if self.navigate_to_ad_page(id_):
                    self.download_ad_page(id_)
                    LOG.info('Downloaded ad with id %d', id_)
                else:
                    LOG.error('The page with the id %d does not exist!', id_)

        else:
            LOG.error('Invalid ads selector "%s" for the download command!', self.ads_selector)

    def _collect_saved_ad_ids(self) -> set[int]:
        """
        Reads the IDs of all ads that are already saved locally.
        The ad files are located via the patterns of the config entry "ad_files",
        resolved relative to the directory of the config file.

        :return: the IDs found in the local ad files
        """
        saved_ad_ids: set[int] = set()
        data_root_dir = os.path.dirname(self.config_file_path)
        for file_pattern in self.config["ad_files"]:
            for ad_file in glob.glob(file_pattern, root_dir=data_root_dir,
                                     flags=glob.GLOBSTAR | glob.BRACE | glob.EXTGLOB):
                ad_file_path = abspath(ad_file, relative_to=data_root_dir)
                ad_dict = utils.load_dict(ad_file_path)
                try:
                    saved_ad_ids.add(int(ad_dict['id']))
                except (KeyError, TypeError, ValueError):
                    # ad files without a usable id (e.g. ads never published yet) cannot match an online ad
                    LOG.debug('No valid ad ID found in %s', ad_file_path)
        return saved_ad_ids
 #############################
 # main entry point
--- a/kleinanzeigen_bot/extract.py
+++ b/kleinanzeigen_bot/extract.py
@@ -2,14 +2,16 @@
 Copyright (C) 2022 Sebastian Thomschke and contributors
 SPDX-License-Identifier: AGPL-3.0-or-later
 """
 from decimal import DecimalException
 import json
 from decimal import DecimalException
 import selenium.webdriver.support.expected_conditions as EC
 from selenium.common.exceptions import NoSuchElementException
 from selenium.webdriver.common.by import By
 from selenium.webdriver.remote.webdriver import WebDriver
 from selenium.webdriver.support.wait import WebDriverWait
-from .utils import parse_decimal
+from .utils import parse_decimal, pause, smooth_scroll_page
 class AdExtractor:
@@ -147,3 +149,57 @@ class AdExtractor:
        # also see 'https://themen.ebay-kleinanzeigen.de/hilfe/deine-anzeigen/Telefon/
        return contact
    def extract_own_ads_references(self) -> list[str]:
        """
        Collects the links to all ads shown on the own ads overview, following the pagination if there is one.

        :return: the links to your ad pages; an empty list if no pagination area could be found
        """
        # open the overview of the own ads and wait until the URL confirms the navigation
        self.driver.get('https://www.ebay-kleinanzeigen.de/m-meine-anzeigen.html')
        WebDriverWait(self.driver, 15).until(EC.url_contains('meine-anzeigen'))
        pause(2000, 3000)

        # the section is looked up before scrolling, the pagination inside it only afterwards
        overview_section = self.driver.find_element(By.CSS_SELECTOR, 'section.jsx-1105488430:nth-child(4)')

        # scroll through the page so that dynamically loaded content is present
        smooth_scroll_page(self.driver)
        pause(2000, 3000)

        # without a pagination area there is nothing to collect
        try:
            pager = overview_section.find_element(By.XPATH, './/div/div[2]/div[2]/div')  # Pagination
        except NoSuchElementException:  # 0 ads - no pagination area
            print('There currently seem to be no ads on your profile!')
            return []

        # more than one button in the pagination means several overview pages
        page_buttons = pager.find_element(By.XPATH, './/div[1]').find_elements(By.TAG_NAME, 'button')
        has_several_pages: bool = len(page_buttons) > 1
        if has_several_pages:
            print('It seems like you have many ads!')
        else:
            print('It seems like all your ads fit on one overview page.')

        ad_links: list[str] = []
        while True:  # one pass per overview page
            # gather the link of every ad card on the current page
            ad_list = self.driver.find_element(By.XPATH, '//*[@id="my-manageads-adlist"]')
            cards = ad_list.find_elements(By.CLASS_NAME, 'cardbox')
            for card in cards:
                anchor = card.find_element(By.XPATH, 'article/section/section[2]/h2/div/a')
                ad_links.append(anchor.get_attribute('href'))

            if not has_several_pages:  # a single page needs a single pass only
                break

            # the last navigation button is titled 'Nächste' as long as a next page exists
            last_nav_button = self.driver.find_elements(By.CSS_SELECTOR, 'button.jsx-2828608826')[-1]
            if last_nav_button.get_attribute('title') != 'Nächste':
                print('Last ad overview page explored.')
                break

            # move on to the next overview page
            last_nav_button.click()
            pause(2000, 3000)
            smooth_scroll_page(self.driver)

        return ad_links
--- a/kleinanzeigen_bot/utils.py
+++ b/kleinanzeigen_bot/utils.py
@@ -11,6 +11,7 @@ from typing import Any, Final, TypeVar
 import coloredlogs, inflect
 from ruamel.yaml import YAML
 from selenium.webdriver.chrome.webdriver import WebDriver
 LOG_ROOT:Final[logging.Logger] = logging.getLogger()
 LOG:Final[logging.Logger] = logging.getLogger("kleinanzeigen_bot.utils")
@@ -270,3 +271,42 @@ def parse_datetime(date:datetime | str | None) -> datetime | None:
    if isinstance(date, datetime):
        return date
    return datetime.fromisoformat(date)
 def smooth_scroll_page(driver: WebDriver, scroll_length: int = 10, scroll_speed: int = 10000, scroll_back_top: bool = False):
    """
    Scrolls the current page of a web driver session.

    The page height is read once before scrolling; the page is then scrolled stepwise to that position
    and, if requested, stepwise back to the top at double speed.

    :param driver: the web driver session
    :param scroll_length: the length of a single scroll iteration, determines smoothness of scrolling, lower is smoother
    :param scroll_speed: the speed of scrolling, higher is faster
    :param scroll_back_top: whether to scroll the page back to the top after scrolling to the bottom
    :raises ValueError: if scroll_length or scroll_speed is not a positive number
    """
    # a non-positive step would never reach the bottom, a zero speed would divide by zero
    if scroll_length <= 0 or scroll_speed <= 0:
        raise ValueError('scroll_length and scroll_speed must be positive!')

    step_delay = scroll_length / scroll_speed  # seconds to wait after each scroll step

    current_y_pos = 0
    bottom_y_pos: int = driver.execute_script('return document.body.scrollHeight;')  # get bottom position by JS
    while current_y_pos < bottom_y_pos:  # scroll in steps until bottom reached
        current_y_pos = min(current_y_pos + scroll_length, bottom_y_pos)  # never request a position below the bottom
        driver.execute_script(f'window.scrollTo(0, {current_y_pos});')  # scroll one step
        time.sleep(step_delay)

    if scroll_back_top:  # scroll back to top in same style
        while current_y_pos > 0:
            current_y_pos = max(current_y_pos - scroll_length, 0)  # never request a negative position
            driver.execute_script(f'window.scrollTo(0, {current_y_pos});')
            time.sleep(step_delay / 2)  # double speed
 def extract_ad_id_from_ad_link(url: str) -> int:
    """
    Extracts the ID of an ad, given by its reference link.

    The ID is expected as the leading number of the last path segment, e.g. '.../<title>/<id>-<x>-<y>'.
    A trailing slash, a query string or a fragment in the link are ignored.

    :param url: the URL to the ad page
    :return: the ad ID, a (ten-digit) integer number; -1 if no ID could be extracted
    """
    from urllib.parse import urlsplit  # local import, only needed by this helper

    try:
        # only the path carries the ID; query string and fragment may contain '/' or '-' themselves
        path = urlsplit(url).path.rstrip('/')
        last_segment = path.rsplit('/', 1)[-1]
        id_part = last_segment.split('-')[0]
        return int(id_part)
    except ValueError:  # malformed URL or non-numeric leading part
        print('The ad ID could not be extracted from the given ad reference!')
        return -1