ADD download --all feature (#139)

This commit is contained in:
PhilK-7
2022-11-22 18:52:32 +01:00
committed by GitHub
parent 23682edb5c
commit 70a23dbcc7
4 changed files with 231 additions and 52 deletions

View File

@@ -179,20 +179,25 @@ Usage: kleinanzeigen-bot COMMAND [OPTIONS]
Commands:
publish - (re-)publishes ads
verify - verifies the configuration files
download - downloads an ad
delete - deletes ads
download - downloads one or multiple ads
--
help - displays this help (default command)
version - displays the application version
help - displays this help (default command)
version - displays the application version
Options:
--ads=all|due|new - specifies which ads to (re-)publish (DEFAULT: due)
--ads=all|due|new (publish) - specifies which ads to (re-)publish (DEFAULT: due)
Possible values:
* all: (re-)publish all ads ignoring republication_interval
* due: publish all new ads and republish ads according to the republication_interval
* new: only publish new ads (i.e. ads that have no id in the config file)
--ads=all|new|<id(s)> (download) - specifies which ads to download (DEFAULT: new)
Possible values:
* all: downloads all ads from your profile
* new: downloads ads from your profile that are not locally saved yet
* <id(s)>: provide one or several ads by ID to download, like e.g. "--ads=1,2,3"
--force - alias for '--ads=all'
--keep-old - don't delete old ads on republication
--ad <ID> - provide the ad ID after this option when using the download command
--config=<PATH> - path to the config YAML or JSON file (DEFAULT: ./config.yaml)
--logfile=<PATH> - path to the logfile (DEFAULT: ./kleinanzeigen-bot.log)
-v, --verbose - enables verbose output - only useful when troubleshooting issues

View File

@@ -3,20 +3,22 @@ Copyright (C) 2022 Sebastian Thomschke and contributors
SPDX-License-Identifier: AGPL-3.0-or-later
"""
import atexit, copy, getopt, importlib.metadata, json, logging, os, signal, sys, textwrap, time, urllib
import re
import shutil
from collections.abc import Iterable
from datetime import datetime
from logging.handlers import RotatingFileHandler
from typing import Any, Final
from urllib import request
from wcmatch import glob
from overrides import overrides
from ruamel.yaml import YAML
from selenium.common.exceptions import NoSuchElementException, TimeoutException, WebDriverException
from selenium.common.exceptions import NoSuchElementException, TimeoutException, WebDriverException, \
ElementClickInterceptedException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from . import utils, resources, extract # pylint: disable=W0406
from .utils import abspath, apply_defaults, ensure, is_frozen, pause, pluralize, safe_get, parse_datetime
@@ -52,7 +54,6 @@ class KleinanzeigenBot(SeleniumMixin):
self.ads_selector = "due"
self.delete_old_ads = True
self.delete_ads_by_title = False
self.ad_id = None # attribute needed when downloading an ad
def __del__(self) -> None:
if self.file_log:
@@ -99,24 +100,16 @@ class KleinanzeigenBot(SeleniumMixin):
LOG.info("############################################")
case "download":
self.configure_file_logging()
# ad ID passed as value to download command
if self.ad_id is None:
LOG.error('Provide the flag \'--ad\' with a valid ad ID to use the download command!')
sys.exit(2)
if self.ad_id < 1:
LOG.error('The given ad ID must be valid!')
sys.exit(2)
LOG.info('Start fetch task for ad with ID %s', str(self.ad_id))
# ad IDs depends on selector
if not (self.ads_selector in {'all', 'new'} or re.compile(r'\d+[,\d+]*').search(self.ads_selector)):
LOG.warning('You provided no ads selector. Defaulting to "new".')
self.ads_selector = 'new'
# start session
self.load_config()
self.create_webdriver_session()
self.login()
# call download function
exists = self.navigate_to_ad_page()
if exists:
self.download_ad_page()
else:
sys.exit(2)
self.start_download_routine() # call correct version of download
case _:
LOG.error("Unknown command: %s", self.command)
sys.exit(2)
@@ -136,20 +129,24 @@ class KleinanzeigenBot(SeleniumMixin):
publish - (re-)publishes ads
verify - verifies the configuration files
delete - deletes ads
download - downloads an ad
download - downloads one or multiple ads
--
help - displays this help (default command)
version - displays the application version
help - displays this help (default command)
version - displays the application version
Options:
--ads=all|due|new - specifies which ads to (re-)publish (DEFAULT: due)
--ads=all|due|new (publish) - specifies which ads to (re-)publish (DEFAULT: due)
Possible values:
* all: (re-)publish all ads ignoring republication_interval
* due: publish all new ads and republish ads according to the republication_interval
* new: only publish new ads (i.e. ads that have no id in the config file)
--ads=all|new|<id(s)> (download) - specifies which ads to download (DEFAULT: new)
Possible values:
* all: downloads all ads from your profile
* new: downloads ads from your profile that are not locally saved yet
* <id(s)>: provide one or several ads by ID to download, like e.g. "--ads=1,2,3"
--force - alias for '--ads=all'
--keep-old - don't delete old ads on republication
--ad <ID> - provide the ad ID after this option when using the download command
--config=<PATH> - path to the config YAML or JSON file (DEFAULT: ./config.yaml)
--logfile=<PATH> - path to the logfile (DEFAULT: ./kleinanzeigen-bot.log)
-v, --verbose - enables verbose output - only useful when troubleshooting issues
@@ -163,7 +160,6 @@ class KleinanzeigenBot(SeleniumMixin):
"force",
"help",
"keep-old",
"ad=",
"logfile=",
"verbose"
])
@@ -190,12 +186,6 @@ class KleinanzeigenBot(SeleniumMixin):
self.ads_selector = "all"
case "--keep-old":
self.delete_old_ads = False
case "--ad":
try:
self.ad_id:int = int(value)
except ValueError: # given value cannot be parsed as integer
LOG.error('The given ad ID (\"%s\") is not a valid number!', value)
sys.exit(2)
case "-v" | "--verbose":
LOG.setLevel(logging.DEBUG)
@@ -663,16 +653,30 @@ class KleinanzeigenBot(SeleniumMixin):
else:
raise TimeoutException("Loading page failed, it still shows fullscreen ad.") from ex
def navigate_to_ad_page(self) -> bool:
def navigate_to_ad_page(self, id_:int | None = None, url:str | None = None) -> bool:
"""
Navigates to an ad page specified with an ad ID.
Navigates to an ad page specified with an ad ID; or alternatively by a given URL.
:param id_: if provided (and no url given), the ID is used to search for the ad to navigate to
:param url: if given, this URL is used instead of an id to find the ad page
:return: whether the navigation to the ad page was successful
"""
# enter the ad ID into the search bar
self.web_input(By.XPATH, '//*[@id="site-search-query"]', str(self.ad_id))
# navigate to ad page and wait
self.web_click(By.XPATH, '//*[@id="site-search-submit"]')
if not (id_ or url):
raise UserWarning('This function needs either the "id_" or "url" parameter given!')
if url:
self.webdriver.get(url) # navigate to URL directly given
else:
# enter the ad ID into the search bar
self.web_input(By.XPATH, '//*[@id="site-search-query"]', str(id_))
# navigate to ad page and wait
submit_button = self.webdriver.find_element(By.XPATH, '//*[@id="site-search-submit"]')
WebDriverWait(self.webdriver, 15).until(EC.element_to_be_clickable(submit_button))
try:
submit_button.click()
except ElementClickInterceptedException: # sometimes: special banner might pop up and intercept
LOG.warning('Waiting for unexpected element to close...')
pause(6000, 10000)
submit_button.click()
pause(1000, 2000)
# handle the case that invalid ad ID given
@@ -686,7 +690,7 @@ class KleinanzeigenBot(SeleniumMixin):
close_button.click()
time.sleep(1)
except NoSuchElementException:
print('(no popup given)')
print('(no popup)')
return True
def download_images_from_ad_page(self, directory:str, ad_id:int, logger:logging.Logger) -> list[str]:
@@ -753,11 +757,12 @@ class KleinanzeigenBot(SeleniumMixin):
return img_paths
def extract_ad_page_info(self, directory:str) -> dict:
def extract_ad_page_info(self, directory:str, id_:int) -> dict:
"""
Extracts all necessary information from an ad´s page.
:param directory: the path of the ad´s previously created directory
:param id_: the ad ID, already extracted by a calling function
:return: a dictionary with the keys as given in an ad YAML, and their respective values
"""
info = {'active': True}
@@ -789,14 +794,15 @@ class KleinanzeigenBot(SeleniumMixin):
info['shipping_type'], info['shipping_costs'] = extractor.extract_shipping_info_from_ad_page()
# fetch images
info['images'] = self.download_images_from_ad_page(directory, self.ad_id, LOG)
info['images'] = self.download_images_from_ad_page(directory, id_, LOG)
# process address
info['contact'] = extractor.extract_contact_from_ad_page()
# process meta info
info['republication_interval'] = 7 # a default value for downloaded ads
info['id'] = self.ad_id
info['id'] = id_
try: # try different locations known for creation date element
creation_date = self.webdriver.find_element(By.XPATH, '/html/body/div[1]/div[2]/div/section[2]/section/section/article/div[3]/div[2]/div[2]/'
'div[1]/span').text
@@ -812,9 +818,12 @@ class KleinanzeigenBot(SeleniumMixin):
return info
def download_ad_page(self):
def download_ad_page(self, id_:int):
"""
Downloads an ad to a specific location, specified by config and ad_id.
Downloads an ad to a specific location, specified by config and ad ID.
NOTE: Requires that the driver session currently is on the ad page.
:param id_: the ad ID
"""
# create sub-directory for ad to download:
@@ -822,7 +831,8 @@ class KleinanzeigenBot(SeleniumMixin):
# make sure configured base directory exists
if not os.path.exists(relative_directory) or not os.path.isdir(relative_directory):
os.mkdir(relative_directory)
new_base_dir = os.path.join(relative_directory, f'ad_{self.ad_id}')
new_base_dir = os.path.join(relative_directory, f'ad_{id_}')
if os.path.exists(new_base_dir):
LOG.info('Deleting current folder of ad...')
shutil.rmtree(new_base_dir)
@@ -830,10 +840,78 @@ class KleinanzeigenBot(SeleniumMixin):
LOG.info('New directory for ad created at %s.', new_base_dir)
# call extraction function
info = self.extract_ad_page_info(new_base_dir)
ad_file_path = new_base_dir + '/' + f'ad_{self.ad_id}.yaml'
info = self.extract_ad_page_info(new_base_dir, id_)
ad_file_path = new_base_dir + '/' + f'ad_{id_}.yaml'
utils.save_dict(ad_file_path, info)
def start_download_routine(self):
    """
    Determines which download mode was chosen with the arguments and calls the matching download routine.

    Depending on ``self.ads_selector`` this downloads either all ads, only not-yet-saved ("new") ads,
    or specific ads given by a comma-separated list of IDs (e.g. "1,2,3").
    """
    if self.ads_selector in {'all', 'new'}:  # explore the ads overview page for these two modes
        LOG.info('Scanning your ad overview...')
        ext = extract.AdExtractor(self.webdriver)
        refs = ext.extract_own_ads_references()
        LOG.info('%d ads were found!', len(refs))
        if self.ads_selector == 'all':
            self._download_all_ads(refs)
        else:
            self._download_new_ads(refs)
    # NOTE: fullmatch with a proper group replaces the former r'\d+[,\d+]*' character class,
    # which also accepted malformed selectors such as '1,,2' or '1+2', and search(), which
    # matched mere substrings (e.g. '12abc') and then crashed on int() below
    elif re.fullmatch(r'\d+(?:,\d+)*', self.ads_selector):  # download ad(s) with specific ID(s)
        self._download_ads_by_id([int(part) for part in self.ads_selector.split(',')])
    else:
        LOG.error('Invalid ads selector for download: %s', self.ads_selector)

def _download_all_ads(self, refs: list[str]):
    """Downloads every ad referenced by the given list of ad page links."""
    LOG.info('Start fetch task for all your ads!')
    success_count = 0
    for ref in refs:
        ref_ad_id: int = utils.extract_ad_id_from_ad_link(ref)
        if self.navigate_to_ad_page(url=ref):
            self.download_ad_page(ref_ad_id)
            success_count += 1
    LOG.info("%d of %d ads were downloaded from your profile.", success_count, len(refs))

def _saved_ad_ids(self) -> set[int]:
    """Collects the IDs of all ads that are already saved locally (read from the configured ad files)."""
    data_root_dir = os.path.dirname(self.config_file_path)
    saved_ids: set[int] = set()
    for file_pattern in self.config["ad_files"]:
        for ad_file in glob.glob(file_pattern, root_dir=data_root_dir,
                                 flags=glob.GLOBSTAR | glob.BRACE | glob.EXTGLOB):
            ad_dict = utils.load_dict(abspath(ad_file, relative_to=data_root_dir))
            saved_ids.add(int(ad_dict['id']))
    return saved_ids

def _download_new_ads(self, refs: list[str]):
    """Downloads only those referenced ads that are not locally saved yet."""
    saved_ad_ids = self._saved_ad_ids()
    LOG.info('Start fetch task for your unsaved ads!')
    new_count = 0
    for ref in refs:
        id_: int = utils.extract_ad_id_from_ad_link(ref)
        if id_ in saved_ad_ids:  # skip ads that are already saved
            LOG.info('The ad with id %d has already been saved.', id_)
            continue
        if self.navigate_to_ad_page(url=ref):
            self.download_ad_page(id_)
            new_count += 1
    LOG.info('%d new ads were downloaded from your profile.', new_count)

def _download_ads_by_id(self, ids: list[int]):
    """Downloads the ads with the explicitly given IDs, looking each one up via the site search."""
    LOG.info('Start fetch task for the ad(s) with the id(s):')
    LOG.info(' | '.join(str(id_) for id_ in ids))
    for id_ in ids:  # call download routine for every ID
        if self.navigate_to_ad_page(id_):
            self.download_ad_page(id_)
            LOG.info('Downloaded ad with id %d', id_)
        else:
            LOG.error('The page with the id %d does not exist!', id_)
#############################
# main entry point

View File

@@ -2,14 +2,16 @@
Copyright (C) 2022 Sebastian Thomschke and contributors
SPDX-License-Identifier: AGPL-3.0-or-later
"""
from decimal import DecimalException
import json
from decimal import DecimalException
import selenium.webdriver.support.expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By
from selenium.webdriver.remote.webdriver import WebDriver
from selenium.webdriver.support.wait import WebDriverWait
from .utils import parse_decimal
from .utils import parse_decimal, pause, smooth_scroll_page
class AdExtractor:
@@ -147,3 +149,57 @@ class AdExtractor:
# also see 'https://themen.ebay-kleinanzeigen.de/hilfe/deine-anzeigen/Telefon/
return contact
def extract_own_ads_references(self) -> list[str]:
    """
    Extracts the references (links) to all of your own ads from the ad overview page.

    Walks through every pagination page of the overview and collects the link of each ad card.

    :return: the links to your ad pages (empty list if the profile has no ads)
    """
    # navigate to the overview page that lists your own ads
    self.driver.get('https://www.ebay-kleinanzeigen.de/m-meine-anzeigen.html')
    WebDriverWait(self.driver, 15).until(EC.url_contains('meine-anzeigen'))
    pause(2000, 3000)
    # collect ad references:
    pagination_section = self.driver.find_element(By.CSS_SELECTOR, 'section.jsx-1105488430:nth-child(4)')
    # scroll down so the dynamically loaded ad list is fully rendered
    smooth_scroll_page(self.driver)
    pause(2000, 3000)
    # detect whether the overview spans multiple pages
    try:
        pagination = pagination_section.find_element(By.XPATH, './/div/div[2]/div[2]/div')  # Pagination
    except NoSuchElementException:  # 0 ads - no pagination area
        print('There currently seem to be no ads on your profile!')
        return []
    n_buttons = len(pagination.find_element(By.XPATH, './/div[1]').find_elements(By.TAG_NAME, 'button'))
    # more than one pagination button means the ads span multiple overview pages
    multi_page: bool = n_buttons > 1
    if multi_page:
        print('It seems like you have many ads!')
    else:
        print('It seems like all your ads fit on one overview page.')
    refs: list[str] = []
    while True:  # loop reference extraction until there is no more forward page
        # extract the ad links of the current page
        list_section = self.driver.find_element(By.XPATH, '//*[@id="my-manageads-adlist"]')
        list_items = list_section.find_elements(By.CLASS_NAME, 'cardbox')
        refs += [li.find_element(By.XPATH, 'article/section/section[2]/h2/div/a').get_attribute('href') for li in list_items]
        if not multi_page:  # only one iteration for a single-page overview
            break
        # check if last page; guard against an empty button list which would raise IndexError on [-1]
        nav_buttons = self.driver.find_elements(By.CSS_SELECTOR, 'button.jsx-2828608826')
        if not nav_buttons or nav_buttons[-1].get_attribute('title') != 'Nächste':
            print('Last ad overview page explored.')
            break
        # navigate to the next overview page
        nav_buttons[-1].click()
        pause(2000, 3000)
        smooth_scroll_page(self.driver)
    return refs

View File

@@ -11,6 +11,7 @@ from typing import Any, Final, TypeVar
import coloredlogs, inflect
from ruamel.yaml import YAML
from selenium.webdriver.chrome.webdriver import WebDriver
LOG_ROOT:Final[logging.Logger] = logging.getLogger()
LOG:Final[logging.Logger] = logging.getLogger("kleinanzeigen_bot.utils")
@@ -270,3 +271,42 @@ def parse_datetime(date:datetime | str | None) -> datetime | None:
if isinstance(date, datetime):
return date
return datetime.fromisoformat(date)
def smooth_scroll_page(driver: WebDriver, scroll_length: int = 10, scroll_speed: int = 10000, scroll_back_top: bool = False):
    """
    Scrolls the current page of a web driver session smoothly to the bottom (and optionally back up).

    :param driver: the web driver session
    :param scroll_length: the length of a single scroll iteration in pixels; determines smoothness of scrolling, lower is smoother
    :param scroll_speed: the speed of scrolling, higher is faster
    :param scroll_back_top: whether to scroll the page back to the top after scrolling to the bottom
    :raises ValueError: if scroll_length or scroll_speed is not positive
    """
    if scroll_length <= 0 or scroll_speed <= 0:
        # a non-positive step would loop forever below; a zero speed would divide by zero
        raise ValueError('scroll_length and scroll_speed must be positive')
    current_y_pos = 0
    bottom_y_pos: int = driver.execute_script('return document.body.scrollHeight;')  # get bottom position via JS
    while current_y_pos < bottom_y_pos:  # scroll in steps until the bottom is reached
        current_y_pos += scroll_length
        driver.execute_script(f'window.scrollTo(0, {current_y_pos});')  # scroll one step
        time.sleep(scroll_length / scroll_speed)
    if scroll_back_top:  # scroll back to the top in the same style
        while current_y_pos > 0:
            current_y_pos -= scroll_length
            driver.execute_script(f'window.scrollTo(0, {current_y_pos});')
            time.sleep(scroll_length / scroll_speed / 2)  # double speed
def extract_ad_id_from_ad_link(url: str) -> int:
    """
    Extracts the ID of an ad, given by its reference link.

    The ID is expected to be the leading digits of the last path segment of the URL.

    :param url: the URL to the ad page
    :return: the ad ID, a (ten-digit) integer number; -1 if no ID could be extracted
    """
    last_segment = url.rsplit('/', 1)[-1]  # everything after the final slash
    candidate = last_segment.split('-', 1)[0]  # leading part up to the first dash
    try:
        return int(candidate)
    except ValueError:
        print('The ad ID could not be extracted from the given ad reference!')
        return -1