diff --git a/README.md b/README.md index af18435..7283902 100644 --- a/README.md +++ b/README.md @@ -179,20 +179,25 @@ Usage: kleinanzeigen-bot COMMAND [OPTIONS] Commands: publish - (re-)publishes ads verify - verifies the configuration files - download - downloads an ad + delete - deletes ads + download - downloads one or multiple ads -- - help - displays this help (default command) - version - displays the application version + help - displays this help (default command) + version - displays the application version Options: - --ads=all|due|new - specifies which ads to (re-)publish (DEFAULT: due) + --ads=all|due|new (publish) - specifies which ads to (re-)publish (DEFAULT: due) Possible values: * all: (re-)publish all ads ignoring republication_interval * due: publish all new ads and republish ads according the republication_interval * new: only publish new ads (i.e. ads that have no id in the config file) + --ads=all|new|<id(s)> (download) - specifies which ads to download (DEFAULT: new) + Possible values: + * all: downloads all ads from your profile + * new: downloads ads from your profile that are not locally saved yet + * <id(s)>: provide one or several ads by ID to download, like e.g. 
"--ads=1,2,3" --force - alias for '--ads=all' --keep-old - don't delete old ads on republication - --ad - provide the ad ID after this option when using the download command --config=<PATH> - path to the config YAML or JSON file (DEFAULT: ./config.yaml) --logfile=<PATH> - path to the logfile (DEFAULT: ./kleinanzeigen-bot.log) -v, --verbose - enables verbose output - only useful when troubleshooting issues diff --git a/kleinanzeigen_bot/__init__.py b/kleinanzeigen_bot/__init__.py index 932bfda..51f86ac 100644 --- a/kleinanzeigen_bot/__init__.py +++ b/kleinanzeigen_bot/__init__.py @@ -3,20 +3,22 @@ Copyright (C) 2022 Sebastian Thomschke and contributors SPDX-License-Identifier: AGPL-3.0-or-later """ import atexit, copy, getopt, importlib.metadata, json, logging, os, signal, sys, textwrap, time, urllib +import re import shutil from collections.abc import Iterable from datetime import datetime from logging.handlers import RotatingFileHandler from typing import Any, Final from urllib import request - from wcmatch import glob from overrides import overrides from ruamel.yaml import YAML -from selenium.common.exceptions import NoSuchElementException, TimeoutException, WebDriverException +from selenium.common.exceptions import NoSuchElementException, TimeoutException, WebDriverException, \ + ElementClickInterceptedException from selenium.webdriver.common.by import By from selenium.webdriver.support import expected_conditions as EC +from selenium.webdriver.support.wait import WebDriverWait from . 
import utils, resources, extract # pylint: disable=W0406 from .utils import abspath, apply_defaults, ensure, is_frozen, pause, pluralize, safe_get, parse_datetime @@ -52,7 +54,6 @@ class KleinanzeigenBot(SeleniumMixin): self.ads_selector = "due" self.delete_old_ads = True self.delete_ads_by_title = False - self.ad_id = None # attribute needed when downloading an ad def __del__(self) -> None: if self.file_log: @@ -99,24 +100,16 @@ class KleinanzeigenBot(SeleniumMixin): LOG.info("############################################") case "download": self.configure_file_logging() - # ad ID passed as value to download command - if self.ad_id is None: - LOG.error('Provide the flag \'--ad\' with a valid ad ID to use the download command!') - sys.exit(2) - if self.ad_id < 1: - LOG.error('The given ad ID must be valid!') - sys.exit(2) - LOG.info('Start fetch task for ad with ID %s', str(self.ad_id)) - + # ad IDs depends on selector + if not (self.ads_selector in {'all', 'new'} or re.compile(r'\d+[,\d+]*').search(self.ads_selector)): + LOG.warning('You provided no ads selector. 
Defaulting to "new".') + self.ads_selector = 'new' + # start session self.load_config() self.create_webdriver_session() self.login() - # call download function - exists = self.navigate_to_ad_page() - if exists: - self.download_ad_page() - else: - sys.exit(2) + self.start_download_routine() # call correct version of download + case _: LOG.error("Unknown command: %s", self.command) sys.exit(2) @@ -136,20 +129,24 @@ class KleinanzeigenBot(SeleniumMixin): publish - (re-)publishes ads verify - verifies the configuration files delete - deletes ads - download - downloads an ad + download - downloads one or multiple ads -- - help - displays this help (default command) - version - displays the application version + help - displays this help (default command) + version - displays the application version Options: - --ads=all|due|new - specifies which ads to (re-)publish (DEFAULT: due) + --ads=all|due|new (publish) - specifies which ads to (re-)publish (DEFAULT: due) Possible values: * all: (re-)publish all ads ignoring republication_interval * due: publish all new ads and republish ads according the republication_interval * new: only publish new ads (i.e. ads that have no id in the config file) + --ads=all|new| (download) - specifies which ads to download (DEFAULT: new) + Possible values: + * all: downloads all ads from your profile + * new: downloads ads from your profile that are not locally saved yet + * : provide one or several ads by ID to download, like e.g. 
"--ads=1,2,3" --force - alias for '--ads=all' --keep-old - don't delete old ads on republication - --ad - provide the ad ID after this option when using the download command --config= - path to the config YAML or JSON file (DEFAULT: ./config.yaml) --logfile= - path to the logfile (DEFAULT: ./kleinanzeigen-bot.log) -v, --verbose - enables verbose output - only useful when troubleshooting issues @@ -163,7 +160,6 @@ class KleinanzeigenBot(SeleniumMixin): "force", "help", "keep-old", - "ad=", "logfile=", "verbose" ]) @@ -190,12 +186,6 @@ class KleinanzeigenBot(SeleniumMixin): self.ads_selector = "all" case "--keep-old": self.delete_old_ads = False - case "--ad": - try: - self.ad_id:int = int(value) - except ValueError: # given value cannot be parsed as integer - LOG.error('The given ad ID (\"%s\") is not a valid number!', value) - sys.exit(2) case "-v" | "--verbose": LOG.setLevel(logging.DEBUG) @@ -663,16 +653,30 @@ class KleinanzeigenBot(SeleniumMixin): else: raise TimeoutException("Loading page failed, it still shows fullscreen ad.") from ex - def navigate_to_ad_page(self) -> bool: + def navigate_to_ad_page(self, id_:int | None = None, url:str | None = None) -> bool: """ - Navigates to an ad page specified with an ad ID. + Navigates to an ad page specified with an ad ID; or alternatively by a given URL. 
+ :param id_: if provided (and no url given), the ID is used to search for the ad to navigate to + :param url: if given, this URL is used instead of an id to find the ad page :return: whether the navigation to the ad page was successful """ - # enter the ad ID into the search bar - self.web_input(By.XPATH, '//*[@id="site-search-query"]', str(self.ad_id)) - # navigate to ad page and wait - self.web_click(By.XPATH, '//*[@id="site-search-submit"]') + if not (id_ or url): + raise UserWarning('This function needs either the "id_" or "url" parameter given!') + if url: + self.webdriver.get(url) # navigate to URL directly given + else: + # enter the ad ID into the search bar + self.web_input(By.XPATH, '//*[@id="site-search-query"]', str(id_)) + # navigate to ad page and wait + submit_button = self.webdriver.find_element(By.XPATH, '//*[@id="site-search-submit"]') + WebDriverWait(self.webdriver, 15).until(EC.element_to_be_clickable(submit_button)) + try: + submit_button.click() + except ElementClickInterceptedException: # sometimes: special banner might pop up and intercept + LOG.warning('Waiting for unexpected element to close...') + pause(6000, 10000) + submit_button.click() pause(1000, 2000) # handle the case that invalid ad ID given @@ -686,7 +690,7 @@ class KleinanzeigenBot(SeleniumMixin): close_button.click() time.sleep(1) except NoSuchElementException: - print('(no popup given)') + print('(no popup)') return True def download_images_from_ad_page(self, directory:str, ad_id:int, logger:logging.Logger) -> list[str]: @@ -753,11 +757,12 @@ class KleinanzeigenBot(SeleniumMixin): return img_paths - def extract_ad_page_info(self, directory:str) -> dict: + def extract_ad_page_info(self, directory:str, id_:int) -> dict: """ Extracts all necessary information from an ad´s page. 
:param directory: the path of the ad´s previously created directory + :param id_: the ad ID, already extracted by a calling function :return: a dictionary with the keys as given in an ad YAML, and their respective values """ info = {'active': True} @@ -789,14 +794,15 @@ class KleinanzeigenBot(SeleniumMixin): info['shipping_type'], info['shipping_costs'] = extractor.extract_shipping_info_from_ad_page() # fetch images - info['images'] = self.download_images_from_ad_page(directory, self.ad_id, LOG) + info['images'] = self.download_images_from_ad_page(directory, id_, LOG) # process address info['contact'] = extractor.extract_contact_from_ad_page() # process meta info info['republication_interval'] = 7 # a default value for downloaded ads - info['id'] = self.ad_id + info['id'] = id_ + try: # try different locations known for creation date element creation_date = self.webdriver.find_element(By.XPATH, '/html/body/div[1]/div[2]/div/section[2]/section/section/article/div[3]/div[2]/div[2]/' 'div[1]/span').text @@ -812,9 +818,12 @@ class KleinanzeigenBot(SeleniumMixin): return info - def download_ad_page(self): + def download_ad_page(self, id_:int): """ - Downloads an ad to a specific location, specified by config and ad_id. + Downloads an ad to a specific location, specified by config and ad ID. + NOTE: Requires that the driver session currently is on the ad page. 
+ + :param id_: the ad ID """ # create sub-directory for ad to download: @@ -822,7 +831,8 @@ class KleinanzeigenBot(SeleniumMixin): # make sure configured base directory exists if not os.path.exists(relative_directory) or not os.path.isdir(relative_directory): os.mkdir(relative_directory) - new_base_dir = os.path.join(relative_directory, f'ad_{self.ad_id}') + + new_base_dir = os.path.join(relative_directory, f'ad_{id_}') if os.path.exists(new_base_dir): LOG.info('Deleting current folder of ad...') shutil.rmtree(new_base_dir) @@ -830,10 +840,78 @@ class KleinanzeigenBot(SeleniumMixin): LOG.info('New directory for ad created at %s.', new_base_dir) # call extraction function - info = self.extract_ad_page_info(new_base_dir) - ad_file_path = new_base_dir + '/' + f'ad_{self.ad_id}.yaml' + info = self.extract_ad_page_info(new_base_dir, id_) + ad_file_path = new_base_dir + '/' + f'ad_{id_}.yaml' utils.save_dict(ad_file_path, info) + def start_download_routine(self): + """ + Determines which download mode was chosen with the arguments, and calls the specified download routine. + This downloads either all, only unsaved (new), or specific ads given by ID. 
+ """ + + # use relevant download routine + if self.ads_selector in {'all', 'new'}: # explore ads overview for these two modes + LOG.info('Scanning your ad overview...') + ext = extract.AdExtractor(self.webdriver) + refs = ext.extract_own_ads_references() + LOG.info('%d ads were found!', len(refs)) + + if self.ads_selector == 'all': # download all of your adds + LOG.info('Start fetch task for all your ads!') + + success_count = 0 + # call download function for each ad page + for ref in refs: + ref_ad_id: int = utils.extract_ad_id_from_ad_link(ref) + if self.navigate_to_ad_page(url=ref): + self.download_ad_page(ref_ad_id) + success_count += 1 + LOG.info("%d of %d ads were downloaded from your profile.", success_count, len(refs)) + + elif self.ads_selector == 'new': # download only unsaved ads + # determine ad IDs from links + ref_ad_ids = [utils.extract_ad_id_from_ad_link(r) for r in refs] + ref_pairs = list(zip(refs, ref_ad_ids)) + + # check which ads already saved + saved_ad_ids = [] + data_root_dir = os.path.dirname(self.config_file_path) + for file_pattern in self.config["ad_files"]: + for ad_file in glob.glob(file_pattern, root_dir=os.path.dirname(self.config_file_path), + flags=glob.GLOBSTAR | glob.BRACE | glob.EXTGLOB): + ad_file_path = abspath(ad_file, relative_to=data_root_dir) + ad_dict = utils.load_dict(ad_file_path) + ad_id = int(ad_dict['id']) + saved_ad_ids.append(ad_id) + + LOG.info('Start fetch task for your unsaved ads!') + new_count = 0 + for ref_pair in ref_pairs: + # check if ad with ID already saved + id_: int = ref_pair[1] + if id_ in saved_ad_ids: + LOG.info('The ad with id %d has already been saved.', id_) + continue + + if self.navigate_to_ad_page(url=ref_pair[0]): + self.download_ad_page(id_) + new_count += 1 + LOG.info('%d new ads were downloaded from your profile.', new_count) + + elif re.compile(r'\d+[,\d+]*').search(self.ads_selector): # download ad(s) with specific id(s) + ids = [int(n) for n in self.ads_selector.split(',')] + 
LOG.info('Start fetch task for the ad(s) with the id(s):') + LOG.info(' | '.join([str(id_) for id_ in ids])) + + for id_ in ids: # call download routine for every id + exists = self.navigate_to_ad_page(id_) + if exists: + self.download_ad_page(id_) + LOG.info('Downloaded ad with id %d', id_) + else: + LOG.error('The page with the id %d does not exist!', id_) + ############################# # main entry point diff --git a/kleinanzeigen_bot/extract.py b/kleinanzeigen_bot/extract.py index 376d680..8d4eb82 100644 --- a/kleinanzeigen_bot/extract.py +++ b/kleinanzeigen_bot/extract.py @@ -2,14 +2,16 @@ Copyright (C) 2022 Sebastian Thomschke and contributors SPDX-License-Identifier: AGPL-3.0-or-later """ -from decimal import DecimalException import json +from decimal import DecimalException +import selenium.webdriver.support.expected_conditions as EC from selenium.common.exceptions import NoSuchElementException from selenium.webdriver.common.by import By from selenium.webdriver.remote.webdriver import WebDriver +from selenium.webdriver.support.wait import WebDriverWait -from .utils import parse_decimal +from .utils import parse_decimal, pause, smooth_scroll_page class AdExtractor: @@ -147,3 +149,57 @@ class AdExtractor: # also see 'https://themen.ebay-kleinanzeigen.de/hilfe/deine-anzeigen/Telefon/ return contact + + def extract_own_ads_references(self) -> list[str]: + """ + Extracts the references to all own ads. 
+ + :return: the links to your ad pages + """ + # navigate to your ads page + self.driver.get('https://www.ebay-kleinanzeigen.de/m-meine-anzeigen.html') + WebDriverWait(self.driver, 15).until(EC.url_contains('meine-anzeigen')) + pause(2000, 3000) + + # collect ad references: + + pagination_section = self.driver.find_element(By.CSS_SELECTOR, 'section.jsx-1105488430:nth-child(4)') + # scroll down to load dynamically + smooth_scroll_page(self.driver) + pause(2000, 3000) + # detect multi-page + try: + pagination = pagination_section.find_element(By.XPATH, './/div/div[2]/div[2]/div') # Pagination + except NoSuchElementException: # 0 ads - no pagination area + print('There currently seem to be no ads on your profile!') + return [] + + n_buttons = len(pagination.find_element(By.XPATH, './/div[1]').find_elements(By.TAG_NAME, 'button')) + multi_page:bool + if n_buttons > 1: + multi_page = True + print('It seems like you have many ads!') + else: + multi_page = False + print('It seems like all your ads fit on one overview page.') + + refs:list[str] = [] + while True: # loop reference extraction until no more forward page + # extract references + list_section = self.driver.find_element(By.XPATH, '//*[@id="my-manageads-adlist"]') + list_items = list_section.find_elements(By.CLASS_NAME, 'cardbox') + refs += [li.find_element(By.XPATH, 'article/section/section[2]/h2/div/a').get_attribute('href') for li in list_items] + + if not multi_page: # only one iteration for single-page overview + break + # check if last page + nav_button = self.driver.find_elements(By.CSS_SELECTOR, 'button.jsx-2828608826')[-1] + if nav_button.get_attribute('title') != 'Nächste': + print('Last ad overview page explored.') + break + # navigate to next overview page + nav_button.click() + pause(2000, 3000) + smooth_scroll_page(self.driver) + + return refs diff --git a/kleinanzeigen_bot/utils.py b/kleinanzeigen_bot/utils.py index f0b4d01..893d44d 100644 --- a/kleinanzeigen_bot/utils.py +++ 
b/kleinanzeigen_bot/utils.py @@ -11,6 +11,7 @@ from typing import Any, Final, TypeVar import coloredlogs, inflect from ruamel.yaml import YAML +from selenium.webdriver.chrome.webdriver import WebDriver LOG_ROOT:Final[logging.Logger] = logging.getLogger() LOG:Final[logging.Logger] = logging.getLogger("kleinanzeigen_bot.utils") @@ -270,3 +271,42 @@ def parse_datetime(date:datetime | str | None) -> datetime | None: if isinstance(date, datetime): return date return datetime.fromisoformat(date) + + +def smooth_scroll_page(driver: WebDriver, scroll_length: int = 10, scroll_speed: int = 10000, scroll_back_top: bool = False): + """ + Scrolls the current page of a web driver session. + :param driver: the web driver session + :param scroll_length: the length of a single scroll iteration, determines smoothness of scrolling, lower is smoother + :param scroll_speed: the speed of scrolling, higher is faster + :param scroll_back_top: whether to scroll the page back to the top after scrolling to the bottom + """ + current_y_pos = 0 + bottom_y_pos: int = driver.execute_script('return document.body.scrollHeight;') # get bottom position by JS + while current_y_pos < bottom_y_pos: # scroll in steps until bottom reached + current_y_pos += scroll_length + driver.execute_script(f'window.scrollTo(0, {current_y_pos});') # scroll one step + time.sleep(scroll_length / scroll_speed) + + if scroll_back_top: # scroll back to top in same style + while current_y_pos > 0: + current_y_pos -= scroll_length + driver.execute_script(f'window.scrollTo(0, {current_y_pos});') + time.sleep(scroll_length / scroll_speed / 2) # double speed + + +def extract_ad_id_from_ad_link(url: str) -> int: + """ + Extracts the ID of an ad, given by its reference link. 
+ + :param url: the URL to the ad page + :return: the ad ID, a (ten-digit) integer number + """ + num_part = url.split('/')[-1] # suffix + id_part = num_part.split('-')[0] + + try: + return int(id_part) + except ValueError: + print('The ad ID could not be extracted from the given ad reference!') + return -1