mirror of
https://github.com/Second-Hand-Friends/kleinanzeigen-bot.git
synced 2026-03-12 10:31:50 +01:00
ADD download --all feature (#139)
This commit is contained in:
15
README.md
15
README.md
@@ -179,20 +179,25 @@ Usage: kleinanzeigen-bot COMMAND [OPTIONS]
|
|||||||
Commands:
|
Commands:
|
||||||
publish - (re-)publishes ads
|
publish - (re-)publishes ads
|
||||||
verify - verifies the configuration files
|
verify - verifies the configuration files
|
||||||
download - downloads an ad
|
delete - deletes ads
|
||||||
|
download - downloads one or multiple ads
|
||||||
--
|
--
|
||||||
help - displays this help (default command)
|
help - displays this help (default command)
|
||||||
version - displays the application version
|
version - displays the application version
|
||||||
|
|
||||||
Options:
|
Options:
|
||||||
--ads=all|due|new - specifies which ads to (re-)publish (DEFAULT: due)
|
--ads=all|due|new (publish) - specifies which ads to (re-)publish (DEFAULT: due)
|
||||||
Possible values:
|
Possible values:
|
||||||
* all: (re-)publish all ads ignoring republication_interval
|
* all: (re-)publish all ads ignoring republication_interval
|
||||||
* due: publish all new ads and republish ads according to the republication_interval
|
* due: publish all new ads and republish ads according to the republication_interval
|
||||||
* new: only publish new ads (i.e. ads that have no id in the config file)
|
* new: only publish new ads (i.e. ads that have no id in the config file)
|
||||||
|
--ads=all|new|<id(s)> (download) - specifies which ads to download (DEFAULT: new)
|
||||||
|
Possible values:
|
||||||
|
* all: downloads all ads from your profile
|
||||||
|
* new: downloads ads from your profile that are not locally saved yet
|
||||||
|
* <id(s)>: provide one or several ads by ID to download, e.g. "--ads=1,2,3"
|
||||||
--force - alias for '--ads=all'
|
--force - alias for '--ads=all'
|
||||||
--keep-old - don't delete old ads on republication
|
--keep-old - don't delete old ads on republication
|
||||||
--ad <ID> - provide the ad ID after this option when using the download command
|
|
||||||
--config=<PATH> - path to the config YAML or JSON file (DEFAULT: ./config.yaml)
|
--config=<PATH> - path to the config YAML or JSON file (DEFAULT: ./config.yaml)
|
||||||
--logfile=<PATH> - path to the logfile (DEFAULT: ./kleinanzeigen-bot.log)
|
--logfile=<PATH> - path to the logfile (DEFAULT: ./kleinanzeigen-bot.log)
|
||||||
-v, --verbose - enables verbose output - only useful when troubleshooting issues
|
-v, --verbose - enables verbose output - only useful when troubleshooting issues
|
||||||
|
|||||||
@@ -3,20 +3,22 @@ Copyright (C) 2022 Sebastian Thomschke and contributors
|
|||||||
SPDX-License-Identifier: AGPL-3.0-or-later
|
SPDX-License-Identifier: AGPL-3.0-or-later
|
||||||
"""
|
"""
|
||||||
import atexit, copy, getopt, importlib.metadata, json, logging, os, signal, sys, textwrap, time, urllib
|
import atexit, copy, getopt, importlib.metadata, json, logging, os, signal, sys, textwrap, time, urllib
|
||||||
|
import re
|
||||||
import shutil
|
import shutil
|
||||||
from collections.abc import Iterable
|
from collections.abc import Iterable
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
from logging.handlers import RotatingFileHandler
|
from logging.handlers import RotatingFileHandler
|
||||||
from typing import Any, Final
|
from typing import Any, Final
|
||||||
from urllib import request
|
from urllib import request
|
||||||
|
|
||||||
from wcmatch import glob
|
from wcmatch import glob
|
||||||
|
|
||||||
from overrides import overrides
|
from overrides import overrides
|
||||||
from ruamel.yaml import YAML
|
from ruamel.yaml import YAML
|
||||||
from selenium.common.exceptions import NoSuchElementException, TimeoutException, WebDriverException
|
from selenium.common.exceptions import NoSuchElementException, TimeoutException, WebDriverException, \
|
||||||
|
ElementClickInterceptedException
|
||||||
from selenium.webdriver.common.by import By
|
from selenium.webdriver.common.by import By
|
||||||
from selenium.webdriver.support import expected_conditions as EC
|
from selenium.webdriver.support import expected_conditions as EC
|
||||||
|
from selenium.webdriver.support.wait import WebDriverWait
|
||||||
|
|
||||||
from . import utils, resources, extract # pylint: disable=W0406
|
from . import utils, resources, extract # pylint: disable=W0406
|
||||||
from .utils import abspath, apply_defaults, ensure, is_frozen, pause, pluralize, safe_get, parse_datetime
|
from .utils import abspath, apply_defaults, ensure, is_frozen, pause, pluralize, safe_get, parse_datetime
|
||||||
@@ -52,7 +54,6 @@ class KleinanzeigenBot(SeleniumMixin):
|
|||||||
self.ads_selector = "due"
|
self.ads_selector = "due"
|
||||||
self.delete_old_ads = True
|
self.delete_old_ads = True
|
||||||
self.delete_ads_by_title = False
|
self.delete_ads_by_title = False
|
||||||
self.ad_id = None # attribute needed when downloading an ad
|
|
||||||
|
|
||||||
def __del__(self) -> None:
|
def __del__(self) -> None:
|
||||||
if self.file_log:
|
if self.file_log:
|
||||||
@@ -99,24 +100,16 @@ class KleinanzeigenBot(SeleniumMixin):
|
|||||||
LOG.info("############################################")
|
LOG.info("############################################")
|
||||||
case "download":
|
case "download":
|
||||||
self.configure_file_logging()
|
self.configure_file_logging()
|
||||||
# ad ID passed as value to download command
|
# the ad IDs to download depend on the selector
|
||||||
if self.ad_id is None:
|
if not (self.ads_selector in {'all', 'new'} or re.compile(r'\d+[,\d+]*').search(self.ads_selector)):
|
||||||
LOG.error('Provide the flag \'--ad\' with a valid ad ID to use the download command!')
|
LOG.warning('You provided no ads selector. Defaulting to "new".')
|
||||||
sys.exit(2)
|
self.ads_selector = 'new'
|
||||||
if self.ad_id < 1:
|
# start session
|
||||||
LOG.error('The given ad ID must be valid!')
|
|
||||||
sys.exit(2)
|
|
||||||
LOG.info('Start fetch task for ad with ID %s', str(self.ad_id))
|
|
||||||
|
|
||||||
self.load_config()
|
self.load_config()
|
||||||
self.create_webdriver_session()
|
self.create_webdriver_session()
|
||||||
self.login()
|
self.login()
|
||||||
# call download function
|
self.start_download_routine() # call correct version of download
|
||||||
exists = self.navigate_to_ad_page()
|
|
||||||
if exists:
|
|
||||||
self.download_ad_page()
|
|
||||||
else:
|
|
||||||
sys.exit(2)
|
|
||||||
case _:
|
case _:
|
||||||
LOG.error("Unknown command: %s", self.command)
|
LOG.error("Unknown command: %s", self.command)
|
||||||
sys.exit(2)
|
sys.exit(2)
|
||||||
@@ -136,20 +129,24 @@ class KleinanzeigenBot(SeleniumMixin):
|
|||||||
publish - (re-)publishes ads
|
publish - (re-)publishes ads
|
||||||
verify - verifies the configuration files
|
verify - verifies the configuration files
|
||||||
delete - deletes ads
|
delete - deletes ads
|
||||||
download - downloads an ad
|
download - downloads one or multiple ads
|
||||||
--
|
--
|
||||||
help - displays this help (default command)
|
help - displays this help (default command)
|
||||||
version - displays the application version
|
version - displays the application version
|
||||||
|
|
||||||
Options:
|
Options:
|
||||||
--ads=all|due|new - specifies which ads to (re-)publish (DEFAULT: due)
|
--ads=all|due|new (publish) - specifies which ads to (re-)publish (DEFAULT: due)
|
||||||
Possible values:
|
Possible values:
|
||||||
* all: (re-)publish all ads ignoring republication_interval
|
* all: (re-)publish all ads ignoring republication_interval
|
||||||
* due: publish all new ads and republish ads according the republication_interval
|
* due: publish all new ads and republish ads according the republication_interval
|
||||||
* new: only publish new ads (i.e. ads that have no id in the config file)
|
* new: only publish new ads (i.e. ads that have no id in the config file)
|
||||||
|
--ads=all|new|<id(s)> (download) - specifies which ads to download (DEFAULT: new)
|
||||||
|
Possible values:
|
||||||
|
* all: downloads all ads from your profile
|
||||||
|
* new: downloads ads from your profile that are not locally saved yet
|
||||||
|
* <id(s)>: provide one or several ads by ID to download, like e.g. "--ads=1,2,3"
|
||||||
--force - alias for '--ads=all'
|
--force - alias for '--ads=all'
|
||||||
--keep-old - don't delete old ads on republication
|
--keep-old - don't delete old ads on republication
|
||||||
--ad <ID> - provide the ad ID after this option when using the download command
|
|
||||||
--config=<PATH> - path to the config YAML or JSON file (DEFAULT: ./config.yaml)
|
--config=<PATH> - path to the config YAML or JSON file (DEFAULT: ./config.yaml)
|
||||||
--logfile=<PATH> - path to the logfile (DEFAULT: ./kleinanzeigen-bot.log)
|
--logfile=<PATH> - path to the logfile (DEFAULT: ./kleinanzeigen-bot.log)
|
||||||
-v, --verbose - enables verbose output - only useful when troubleshooting issues
|
-v, --verbose - enables verbose output - only useful when troubleshooting issues
|
||||||
@@ -163,7 +160,6 @@ class KleinanzeigenBot(SeleniumMixin):
|
|||||||
"force",
|
"force",
|
||||||
"help",
|
"help",
|
||||||
"keep-old",
|
"keep-old",
|
||||||
"ad=",
|
|
||||||
"logfile=",
|
"logfile=",
|
||||||
"verbose"
|
"verbose"
|
||||||
])
|
])
|
||||||
@@ -190,12 +186,6 @@ class KleinanzeigenBot(SeleniumMixin):
|
|||||||
self.ads_selector = "all"
|
self.ads_selector = "all"
|
||||||
case "--keep-old":
|
case "--keep-old":
|
||||||
self.delete_old_ads = False
|
self.delete_old_ads = False
|
||||||
case "--ad":
|
|
||||||
try:
|
|
||||||
self.ad_id:int = int(value)
|
|
||||||
except ValueError: # given value cannot be parsed as integer
|
|
||||||
LOG.error('The given ad ID (\"%s\") is not a valid number!', value)
|
|
||||||
sys.exit(2)
|
|
||||||
case "-v" | "--verbose":
|
case "-v" | "--verbose":
|
||||||
LOG.setLevel(logging.DEBUG)
|
LOG.setLevel(logging.DEBUG)
|
||||||
|
|
||||||
@@ -663,16 +653,30 @@ class KleinanzeigenBot(SeleniumMixin):
|
|||||||
else:
|
else:
|
||||||
raise TimeoutException("Loading page failed, it still shows fullscreen ad.") from ex
|
raise TimeoutException("Loading page failed, it still shows fullscreen ad.") from ex
|
||||||
|
|
||||||
def navigate_to_ad_page(self) -> bool:
|
def navigate_to_ad_page(self, id_:int | None = None, url:str | None = None) -> bool:
|
||||||
"""
|
"""
|
||||||
Navigates to an ad page specified with an ad ID.
|
Navigates to an ad page specified with an ad ID; or alternatively by a given URL.
|
||||||
|
|
||||||
|
:param id_: if provided (and no url given), the ID is used to search for the ad to navigate to
|
||||||
|
:param url: if given, this URL is used instead of an id to find the ad page
|
||||||
:return: whether the navigation to the ad page was successful
|
:return: whether the navigation to the ad page was successful
|
||||||
"""
|
"""
|
||||||
# enter the ad ID into the search bar
|
if not (id_ or url):
|
||||||
self.web_input(By.XPATH, '//*[@id="site-search-query"]', str(self.ad_id))
|
raise UserWarning('This function needs either the "id_" or "url" parameter given!')
|
||||||
# navigate to ad page and wait
|
if url:
|
||||||
self.web_click(By.XPATH, '//*[@id="site-search-submit"]')
|
self.webdriver.get(url) # navigate to URL directly given
|
||||||
|
else:
|
||||||
|
# enter the ad ID into the search bar
|
||||||
|
self.web_input(By.XPATH, '//*[@id="site-search-query"]', str(id_))
|
||||||
|
# navigate to ad page and wait
|
||||||
|
submit_button = self.webdriver.find_element(By.XPATH, '//*[@id="site-search-submit"]')
|
||||||
|
WebDriverWait(self.webdriver, 15).until(EC.element_to_be_clickable(submit_button))
|
||||||
|
try:
|
||||||
|
submit_button.click()
|
||||||
|
except ElementClickInterceptedException: # sometimes: special banner might pop up and intercept
|
||||||
|
LOG.warning('Waiting for unexpected element to close...')
|
||||||
|
pause(6000, 10000)
|
||||||
|
submit_button.click()
|
||||||
pause(1000, 2000)
|
pause(1000, 2000)
|
||||||
|
|
||||||
# handle the case that invalid ad ID given
|
# handle the case that invalid ad ID given
|
||||||
@@ -686,7 +690,7 @@ class KleinanzeigenBot(SeleniumMixin):
|
|||||||
close_button.click()
|
close_button.click()
|
||||||
time.sleep(1)
|
time.sleep(1)
|
||||||
except NoSuchElementException:
|
except NoSuchElementException:
|
||||||
print('(no popup given)')
|
print('(no popup)')
|
||||||
return True
|
return True
|
||||||
|
|
||||||
def download_images_from_ad_page(self, directory:str, ad_id:int, logger:logging.Logger) -> list[str]:
|
def download_images_from_ad_page(self, directory:str, ad_id:int, logger:logging.Logger) -> list[str]:
|
||||||
@@ -753,11 +757,12 @@ class KleinanzeigenBot(SeleniumMixin):
|
|||||||
|
|
||||||
return img_paths
|
return img_paths
|
||||||
|
|
||||||
def extract_ad_page_info(self, directory:str) -> dict:
|
def extract_ad_page_info(self, directory:str, id_:int) -> dict:
|
||||||
"""
|
"""
|
||||||
Extracts all necessary information from an ad´s page.
|
Extracts all necessary information from an ad´s page.
|
||||||
|
|
||||||
:param directory: the path of the ad´s previously created directory
|
:param directory: the path of the ad´s previously created directory
|
||||||
|
:param id_: the ad ID, already extracted by a calling function
|
||||||
:return: a dictionary with the keys as given in an ad YAML, and their respective values
|
:return: a dictionary with the keys as given in an ad YAML, and their respective values
|
||||||
"""
|
"""
|
||||||
info = {'active': True}
|
info = {'active': True}
|
||||||
@@ -789,14 +794,15 @@ class KleinanzeigenBot(SeleniumMixin):
|
|||||||
info['shipping_type'], info['shipping_costs'] = extractor.extract_shipping_info_from_ad_page()
|
info['shipping_type'], info['shipping_costs'] = extractor.extract_shipping_info_from_ad_page()
|
||||||
|
|
||||||
# fetch images
|
# fetch images
|
||||||
info['images'] = self.download_images_from_ad_page(directory, self.ad_id, LOG)
|
info['images'] = self.download_images_from_ad_page(directory, id_, LOG)
|
||||||
|
|
||||||
# process address
|
# process address
|
||||||
info['contact'] = extractor.extract_contact_from_ad_page()
|
info['contact'] = extractor.extract_contact_from_ad_page()
|
||||||
|
|
||||||
# process meta info
|
# process meta info
|
||||||
info['republication_interval'] = 7 # a default value for downloaded ads
|
info['republication_interval'] = 7 # a default value for downloaded ads
|
||||||
info['id'] = self.ad_id
|
info['id'] = id_
|
||||||
|
|
||||||
try: # try different locations known for creation date element
|
try: # try different locations known for creation date element
|
||||||
creation_date = self.webdriver.find_element(By.XPATH, '/html/body/div[1]/div[2]/div/section[2]/section/section/article/div[3]/div[2]/div[2]/'
|
creation_date = self.webdriver.find_element(By.XPATH, '/html/body/div[1]/div[2]/div/section[2]/section/section/article/div[3]/div[2]/div[2]/'
|
||||||
'div[1]/span').text
|
'div[1]/span').text
|
||||||
@@ -812,9 +818,12 @@ class KleinanzeigenBot(SeleniumMixin):
|
|||||||
|
|
||||||
return info
|
return info
|
||||||
|
|
||||||
def download_ad_page(self):
|
def download_ad_page(self, id_:int):
|
||||||
"""
|
"""
|
||||||
Downloads an ad to a specific location, specified by config and ad_id.
|
Downloads an ad to a specific location, specified by config and ad ID.
|
||||||
|
NOTE: Requires that the driver session currently is on the ad page.
|
||||||
|
|
||||||
|
:param id_: the ad ID
|
||||||
"""
|
"""
|
||||||
|
|
||||||
# create sub-directory for ad to download:
|
# create sub-directory for ad to download:
|
||||||
@@ -822,7 +831,8 @@ class KleinanzeigenBot(SeleniumMixin):
|
|||||||
# make sure configured base directory exists
|
# make sure configured base directory exists
|
||||||
if not os.path.exists(relative_directory) or not os.path.isdir(relative_directory):
|
if not os.path.exists(relative_directory) or not os.path.isdir(relative_directory):
|
||||||
os.mkdir(relative_directory)
|
os.mkdir(relative_directory)
|
||||||
new_base_dir = os.path.join(relative_directory, f'ad_{self.ad_id}')
|
|
||||||
|
new_base_dir = os.path.join(relative_directory, f'ad_{id_}')
|
||||||
if os.path.exists(new_base_dir):
|
if os.path.exists(new_base_dir):
|
||||||
LOG.info('Deleting current folder of ad...')
|
LOG.info('Deleting current folder of ad...')
|
||||||
shutil.rmtree(new_base_dir)
|
shutil.rmtree(new_base_dir)
|
||||||
@@ -830,10 +840,78 @@ class KleinanzeigenBot(SeleniumMixin):
|
|||||||
LOG.info('New directory for ad created at %s.', new_base_dir)
|
LOG.info('New directory for ad created at %s.', new_base_dir)
|
||||||
|
|
||||||
# call extraction function
|
# call extraction function
|
||||||
info = self.extract_ad_page_info(new_base_dir)
|
info = self.extract_ad_page_info(new_base_dir, id_)
|
||||||
ad_file_path = new_base_dir + '/' + f'ad_{self.ad_id}.yaml'
|
ad_file_path = new_base_dir + '/' + f'ad_{id_}.yaml'
|
||||||
utils.save_dict(ad_file_path, info)
|
utils.save_dict(ad_file_path, info)
|
||||||
|
|
||||||
|
def start_download_routine(self) -> None:
    """
    Determines which download mode was chosen via ``self.ads_selector`` and runs it.

    Supported selector values:

    * ``all``: downloads every ad referenced on the own ad overview
    * ``new``: downloads only those overview ads whose ID is not stored in a local ad file
    * ``<id(s)>``: downloads the ads given as a comma-separated list of numeric IDs, e.g. ``1,2,3``

    Any other selector value is reported via the log and nothing is downloaded.
    Uses ``self.webdriver`` for all page interaction.
    """
    selector = self.ads_selector

    if selector in {'all', 'new'}:  # both modes start from the ad overview
        LOG.info('Scanning your ad overview...')
        refs = extract.AdExtractor(self.webdriver).extract_own_ads_references()
        LOG.info('%d ads were found!', len(refs))

        download_all = selector == 'all'
        if download_all:
            skip_ids: set[int] = set()
            LOG.info('Start fetch task for all your ads!')
        else:
            skip_ids = self._load_saved_ad_ids()
            LOG.info('Start fetch task for your unsaved ads!')

        downloaded = 0
        for ref in refs:
            ad_id: int = utils.extract_ad_id_from_ad_link(ref)
            if ad_id < 0:
                # the helper signals an unparsable link with a negative ID; downloading it
                # would store the ad under a bogus "ad_-1" directory
                LOG.warning('Skipping ad reference without a valid ad ID: %s', ref)
                continue
            if ad_id in skip_ids:
                LOG.info('The ad with id %d has already been saved.', ad_id)
                continue
            if self.navigate_to_ad_page(url=ref):
                self.download_ad_page(ad_id)
                downloaded += 1

        if download_all:
            LOG.info("%d of %d ads were downloaded from your profile.", downloaded, len(refs))
        else:
            LOG.info('%d new ads were downloaded from your profile.', downloaded)
        return

    # explicit IDs: the whole selector must be a comma-separated list of numbers
    # (surrounding whitespace is tolerated because int() accepts it as well)
    if not re.fullmatch(r'\s*\d+\s*(?:,\s*\d+\s*)*', selector):
        LOG.error('Invalid ads selector "%s": expected "all", "new" or comma-separated ad IDs.', selector)
        return

    ids = [int(n) for n in selector.split(',')]
    LOG.info('Start fetch task for the ad(s) with the id(s):')
    LOG.info(' | '.join([str(id_) for id_ in ids]))

    for id_ in ids:  # call download routine for every id
        if self.navigate_to_ad_page(id_):
            self.download_ad_page(id_)
            LOG.info('Downloaded ad with id %d', id_)
        else:
            LOG.error('The page with the id %d does not exist!', id_)

def _load_saved_ad_ids(self) -> set[int]:
    """
    Collects the ad IDs stored in the local ad files matched by the configured ``ad_files`` patterns.

    :return: the IDs found; ad files without a usable ``id`` entry are ignored
    """
    data_root_dir = os.path.dirname(self.config_file_path)
    saved_ad_ids: set[int] = set()
    for file_pattern in self.config["ad_files"]:
        for ad_file in glob.glob(file_pattern, root_dir=data_root_dir,
                                 flags=glob.GLOBSTAR | glob.BRACE | glob.EXTGLOB):
            ad_file_path = abspath(ad_file, relative_to=data_root_dir)
            ad_dict = utils.load_dict(ad_file_path)
            try:
                saved_ad_ids.add(int(ad_dict['id']))
            except (KeyError, TypeError, ValueError):
                # ads that were never published have no id in their file yet
                LOG.debug('Ad file %s contains no usable id, ignoring it.', ad_file_path)
    return saved_ad_ids
|
||||||
|
|
||||||
|
|
||||||
#############################
|
#############################
|
||||||
# main entry point
|
# main entry point
|
||||||
|
|||||||
@@ -2,14 +2,16 @@
|
|||||||
Copyright (C) 2022 Sebastian Thomschke and contributors
|
Copyright (C) 2022 Sebastian Thomschke and contributors
|
||||||
SPDX-License-Identifier: AGPL-3.0-or-later
|
SPDX-License-Identifier: AGPL-3.0-or-later
|
||||||
"""
|
"""
|
||||||
from decimal import DecimalException
|
|
||||||
import json
|
import json
|
||||||
|
from decimal import DecimalException
|
||||||
|
|
||||||
|
import selenium.webdriver.support.expected_conditions as EC
|
||||||
from selenium.common.exceptions import NoSuchElementException
|
from selenium.common.exceptions import NoSuchElementException
|
||||||
from selenium.webdriver.common.by import By
|
from selenium.webdriver.common.by import By
|
||||||
from selenium.webdriver.remote.webdriver import WebDriver
|
from selenium.webdriver.remote.webdriver import WebDriver
|
||||||
|
from selenium.webdriver.support.wait import WebDriverWait
|
||||||
|
|
||||||
from .utils import parse_decimal
|
from .utils import parse_decimal, pause, smooth_scroll_page
|
||||||
|
|
||||||
|
|
||||||
class AdExtractor:
|
class AdExtractor:
|
||||||
@@ -147,3 +149,57 @@ class AdExtractor:
|
|||||||
# also see 'https://themen.ebay-kleinanzeigen.de/hilfe/deine-anzeigen/Telefon/
|
# also see 'https://themen.ebay-kleinanzeigen.de/hilfe/deine-anzeigen/Telefon/
|
||||||
|
|
||||||
return contact
|
return contact
|
||||||
|
|
||||||
|
def extract_own_ads_references(self) -> list[str]:
    """
    Gathers the links of all ads shown on the "my ads" overview.

    Opens the overview page, scrolls it so dynamically loaded content appears and then
    reads the ad links page by page, following the last pagination button for as long
    as it is titled 'Nächste' (German for "next").

    :return: the ``href`` values of the ad links; an empty list if no pagination area is found
    """
    driver = self.driver

    # open the overview and give it time to settle
    driver.get('https://www.ebay-kleinanzeigen.de/m-meine-anzeigen.html')
    WebDriverWait(driver, 15).until(EC.url_contains('meine-anzeigen'))
    pause(2000, 3000)

    pagination_section = driver.find_element(By.CSS_SELECTOR, 'section.jsx-1105488430:nth-child(4)')
    # scrolling triggers the dynamic loading of the page content
    smooth_scroll_page(driver)
    pause(2000, 3000)

    try:
        pagination = pagination_section.find_element(By.XPATH, './/div/div[2]/div[2]/div')
    except NoSuchElementException:
        # without ads there is no pagination area at all
        print('There currently seem to be no ads on your profile!')
        return []

    # more than one pagination button means the overview spans several pages
    button_row = pagination.find_element(By.XPATH, './/div[1]')
    multi_page: bool = len(button_row.find_elements(By.TAG_NAME, 'button')) > 1
    if multi_page:
        print('It seems like you have many ads!')
    else:
        print('It seems like all your ads fit on one overview page.')

    refs: list[str] = []
    while True:
        # read the ad links of the overview page currently shown
        ad_list = driver.find_element(By.XPATH, '//*[@id="my-manageads-adlist"]')
        cards = ad_list.find_elements(By.CLASS_NAME, 'cardbox')
        page_refs = [card.find_element(By.XPATH, 'article/section/section[2]/h2/div/a').get_attribute('href')
                     for card in cards]
        refs.extend(page_refs)

        if not multi_page:
            break  # a single-page overview is done after one pass

        # the last navigation button is the "next" button unless this is the final page
        last_button = driver.find_elements(By.CSS_SELECTOR, 'button.jsx-2828608826')[-1]
        if last_button.get_attribute('title') != 'Nächste':
            print('Last ad overview page explored.')
            break

        # move on to the next overview page
        last_button.click()
        pause(2000, 3000)
        smooth_scroll_page(driver)

    return refs
|
||||||
|
|||||||
@@ -11,6 +11,7 @@ from typing import Any, Final, TypeVar
|
|||||||
|
|
||||||
import coloredlogs, inflect
|
import coloredlogs, inflect
|
||||||
from ruamel.yaml import YAML
|
from ruamel.yaml import YAML
|
||||||
|
from selenium.webdriver.chrome.webdriver import WebDriver
|
||||||
|
|
||||||
LOG_ROOT:Final[logging.Logger] = logging.getLogger()
|
LOG_ROOT:Final[logging.Logger] = logging.getLogger()
|
||||||
LOG:Final[logging.Logger] = logging.getLogger("kleinanzeigen_bot.utils")
|
LOG:Final[logging.Logger] = logging.getLogger("kleinanzeigen_bot.utils")
|
||||||
@@ -270,3 +271,42 @@ def parse_datetime(date:datetime | str | None) -> datetime | None:
|
|||||||
if isinstance(date, datetime):
|
if isinstance(date, datetime):
|
||||||
return date
|
return date
|
||||||
return datetime.fromisoformat(date)
|
return datetime.fromisoformat(date)
|
||||||
|
|
||||||
|
|
||||||
|
def smooth_scroll_page(driver: "WebDriver", scroll_length: int = 10, scroll_speed: int = 10000,
                       scroll_back_top: bool = False) -> None:
    """
    Scrolls the current page of a web driver session step by step to the bottom.

    The bottom position is read once (``document.body.scrollHeight``) before scrolling starts.

    :param driver: the web driver session
    :param scroll_length: the length of a single scroll iteration, determines smoothness of scrolling, lower is smoother
    :param scroll_speed: the speed of scrolling, higher is faster
    :param scroll_back_top: whether to scroll the page back to the top after scrolling to the bottom
    :raises ValueError: if ``scroll_length`` or ``scroll_speed`` is not positive
    """
    # a non-positive step would never reach the bottom, a zero speed would divide by zero
    if scroll_length <= 0:
        raise ValueError('scroll_length must be positive')
    if scroll_speed <= 0:
        raise ValueError('scroll_speed must be positive')

    step_delay = scroll_length / scroll_speed
    current_y_pos = 0
    bottom_y_pos: int = driver.execute_script('return document.body.scrollHeight;')  # get bottom position by JS

    while current_y_pos < bottom_y_pos:  # scroll in steps until bottom reached
        current_y_pos = min(current_y_pos + scroll_length, bottom_y_pos)  # never request a position past the bottom
        driver.execute_script(f'window.scrollTo(0, {current_y_pos});')  # scroll one step
        time.sleep(step_delay)

    if scroll_back_top:  # scroll back to top in same style
        while current_y_pos > 0:
            current_y_pos = max(current_y_pos - scroll_length, 0)  # stop exactly at the top
            driver.execute_script(f'window.scrollTo(0, {current_y_pos});')
            time.sleep(step_delay / 2)  # double speed
|
||||||
|
|
||||||
|
|
||||||
|
def extract_ad_id_from_ad_link(url: str) -> int:
    """
    Extracts the ID of an ad, given by its reference link.

    The ID is taken from the last non-empty path segment of the link, up to the first
    hyphen. A query string, a fragment or a trailing slash does not affect the result.

    :param url: the URL to the ad page
    :return: the ad ID as an integer, or -1 if no numeric ID could be extracted
    """
    from urllib.parse import urlsplit  # local import keeps this function self-contained

    try:
        path = urlsplit(url).path  # drops query string and fragment
    except ValueError:  # raised by urlsplit for malformed URLs
        path = ''

    last_segment = path.rstrip('/').split('/')[-1]
    id_part = last_segment.split('-')[0]

    # only plain digit sequences count as an ID; int() alone would also accept forms like "1_0"
    if id_part.isdecimal():
        return int(id_part)

    print('The ad ID could not be extracted from the given ad reference!')
    return -1
|
||||||
|
|||||||
Reference in New Issue
Block a user