ADD download --all feature (#139)

This commit is contained in:
PhilK-7
2022-11-22 18:52:32 +01:00
committed by GitHub
parent 23682edb5c
commit 70a23dbcc7
4 changed files with 231 additions and 52 deletions

View File

@@ -179,20 +179,25 @@ Usage: kleinanzeigen-bot COMMAND [OPTIONS]
Commands:
publish - (re-)publishes ads
verify - verifies the configuration files
download - downloads an ad
delete - deletes ads
download - downloads one or multiple ads
--
help - displays this help (default command)
version - displays the application version
help - displays this help (default command)
version - displays the application version
Options:
--ads=all|due|new - specifies which ads to (re-)publish (DEFAULT: due)
--ads=all|due|new (publish) - specifies which ads to (re-)publish (DEFAULT: due)
Possible values:
* all: (re-)publish all ads ignoring republication_interval
* due: publish all new ads and republish ads according to the republication_interval
* new: only publish new ads (i.e. ads that have no id in the config file)
--ads=all|new|<id(s)> (download) - specifies which ads to download (DEFAULT: new)
Possible values:
* all: downloads all ads from your profile
* new: downloads ads from your profile that are not locally saved yet
* <id(s)>: provide one or several ads by ID to download, like e.g. "--ads=1,2,3"
--force - alias for '--ads=all'
--keep-old - don't delete old ads on republication
--ad <ID> - provide the ad ID after this option when using the download command
--config=<PATH> - path to the config YAML or JSON file (DEFAULT: ./config.yaml)
--logfile=<PATH> - path to the logfile (DEFAULT: ./kleinanzeigen-bot.log)
-v, --verbose - enables verbose output - only useful when troubleshooting issues

View File

@@ -3,20 +3,22 @@ Copyright (C) 2022 Sebastian Thomschke and contributors
SPDX-License-Identifier: AGPL-3.0-or-later
"""
import atexit, copy, getopt, importlib.metadata, json, logging, os, signal, sys, textwrap, time, urllib
import re
import shutil
from collections.abc import Iterable
from datetime import datetime
from logging.handlers import RotatingFileHandler
from typing import Any, Final
from urllib import request
from wcmatch import glob
from overrides import overrides
from ruamel.yaml import YAML
from selenium.common.exceptions import NoSuchElementException, TimeoutException, WebDriverException
from selenium.common.exceptions import NoSuchElementException, TimeoutException, WebDriverException, \
ElementClickInterceptedException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from . import utils, resources, extract # pylint: disable=W0406
from .utils import abspath, apply_defaults, ensure, is_frozen, pause, pluralize, safe_get, parse_datetime
@@ -52,7 +54,6 @@ class KleinanzeigenBot(SeleniumMixin):
self.ads_selector = "due"
self.delete_old_ads = True
self.delete_ads_by_title = False
self.ad_id = None # attribute needed when downloading an ad
def __del__(self) -> None:
if self.file_log:
@@ -99,24 +100,16 @@ class KleinanzeigenBot(SeleniumMixin):
LOG.info("############################################")
case "download":
self.configure_file_logging()
# ad ID passed as value to download command
if self.ad_id is None:
LOG.error('Provide the flag \'--ad\' with a valid ad ID to use the download command!')
sys.exit(2)
if self.ad_id < 1:
LOG.error('The given ad ID must be valid!')
sys.exit(2)
LOG.info('Start fetch task for ad with ID %s', str(self.ad_id))
# ad IDs depends on selector
if not (self.ads_selector in {'all', 'new'} or re.compile(r'\d+[,\d+]*').search(self.ads_selector)):
LOG.warning('You provided no ads selector. Defaulting to "new".')
self.ads_selector = 'new'
# start session
self.load_config()
self.create_webdriver_session()
self.login()
# call download function
exists = self.navigate_to_ad_page()
if exists:
self.download_ad_page()
else:
sys.exit(2)
self.start_download_routine() # call correct version of download
case _:
LOG.error("Unknown command: %s", self.command)
sys.exit(2)
@@ -136,20 +129,24 @@ class KleinanzeigenBot(SeleniumMixin):
publish - (re-)publishes ads
verify - verifies the configuration files
delete - deletes ads
download - downloads an ad
download - downloads one or multiple ads
--
help - displays this help (default command)
version - displays the application version
help - displays this help (default command)
version - displays the application version
Options:
--ads=all|due|new - specifies which ads to (re-)publish (DEFAULT: due)
--ads=all|due|new (publish) - specifies which ads to (re-)publish (DEFAULT: due)
Possible values:
* all: (re-)publish all ads ignoring republication_interval
* due: publish all new ads and republish ads according to the republication_interval
* new: only publish new ads (i.e. ads that have no id in the config file)
--ads=all|new|<id(s)> (download) - specifies which ads to download (DEFAULT: new)
Possible values:
* all: downloads all ads from your profile
* new: downloads ads from your profile that are not locally saved yet
* <id(s)>: provide one or several ads by ID to download, like e.g. "--ads=1,2,3"
--force - alias for '--ads=all'
--keep-old - don't delete old ads on republication
--ad <ID> - provide the ad ID after this option when using the download command
--config=<PATH> - path to the config YAML or JSON file (DEFAULT: ./config.yaml)
--logfile=<PATH> - path to the logfile (DEFAULT: ./kleinanzeigen-bot.log)
-v, --verbose - enables verbose output - only useful when troubleshooting issues
@@ -163,7 +160,6 @@ class KleinanzeigenBot(SeleniumMixin):
"force",
"help",
"keep-old",
"ad=",
"logfile=",
"verbose"
])
@@ -190,12 +186,6 @@ class KleinanzeigenBot(SeleniumMixin):
self.ads_selector = "all"
case "--keep-old":
self.delete_old_ads = False
case "--ad":
try:
self.ad_id:int = int(value)
except ValueError: # given value cannot be parsed as integer
LOG.error('The given ad ID (\"%s\") is not a valid number!', value)
sys.exit(2)
case "-v" | "--verbose":
LOG.setLevel(logging.DEBUG)
@@ -663,16 +653,30 @@ class KleinanzeigenBot(SeleniumMixin):
else:
raise TimeoutException("Loading page failed, it still shows fullscreen ad.") from ex
def navigate_to_ad_page(self) -> bool:
def navigate_to_ad_page(self, id_:int | None = None, url:str | None = None) -> bool:
"""
Navigates to an ad page specified with an ad ID.
Navigates to an ad page specified with an ad ID; or alternatively by a given URL.
:param id_: if provided (and no url given), the ID is used to search for the ad to navigate to
:param url: if given, this URL is used instead of an id to find the ad page
:return: whether the navigation to the ad page was successful
"""
# enter the ad ID into the search bar
self.web_input(By.XPATH, '//*[@id="site-search-query"]', str(self.ad_id))
# navigate to ad page and wait
self.web_click(By.XPATH, '//*[@id="site-search-submit"]')
if not (id_ or url):
raise UserWarning('This function needs either the "id_" or "url" parameter given!')
if url:
self.webdriver.get(url) # navigate to URL directly given
else:
# enter the ad ID into the search bar
self.web_input(By.XPATH, '//*[@id="site-search-query"]', str(id_))
# navigate to ad page and wait
submit_button = self.webdriver.find_element(By.XPATH, '//*[@id="site-search-submit"]')
WebDriverWait(self.webdriver, 15).until(EC.element_to_be_clickable(submit_button))
try:
submit_button.click()
except ElementClickInterceptedException: # sometimes: special banner might pop up and intercept
LOG.warning('Waiting for unexpected element to close...')
pause(6000, 10000)
submit_button.click()
pause(1000, 2000)
# handle the case that invalid ad ID given
@@ -686,7 +690,7 @@ class KleinanzeigenBot(SeleniumMixin):
close_button.click()
time.sleep(1)
except NoSuchElementException:
print('(no popup given)')
print('(no popup)')
return True
def download_images_from_ad_page(self, directory:str, ad_id:int, logger:logging.Logger) -> list[str]:
@@ -753,11 +757,12 @@ class KleinanzeigenBot(SeleniumMixin):
return img_paths
def extract_ad_page_info(self, directory:str) -> dict:
def extract_ad_page_info(self, directory:str, id_:int) -> dict:
"""
Extracts all necessary information from an ad´s page.
:param directory: the path of the ad´s previously created directory
:param id_: the ad ID, already extracted by a calling function
:return: a dictionary with the keys as given in an ad YAML, and their respective values
"""
info = {'active': True}
@@ -789,14 +794,15 @@ class KleinanzeigenBot(SeleniumMixin):
info['shipping_type'], info['shipping_costs'] = extractor.extract_shipping_info_from_ad_page()
# fetch images
info['images'] = self.download_images_from_ad_page(directory, self.ad_id, LOG)
info['images'] = self.download_images_from_ad_page(directory, id_, LOG)
# process address
info['contact'] = extractor.extract_contact_from_ad_page()
# process meta info
info['republication_interval'] = 7 # a default value for downloaded ads
info['id'] = self.ad_id
info['id'] = id_
try: # try different locations known for creation date element
creation_date = self.webdriver.find_element(By.XPATH, '/html/body/div[1]/div[2]/div/section[2]/section/section/article/div[3]/div[2]/div[2]/'
'div[1]/span').text
@@ -812,9 +818,12 @@ class KleinanzeigenBot(SeleniumMixin):
return info
def download_ad_page(self):
def download_ad_page(self, id_:int):
"""
Downloads an ad to a specific location, specified by config and ad_id.
Downloads an ad to a specific location, specified by config and ad ID.
NOTE: Requires that the driver session currently is on the ad page.
:param id_: the ad ID
"""
# create sub-directory for ad to download:
@@ -822,7 +831,8 @@ class KleinanzeigenBot(SeleniumMixin):
# make sure configured base directory exists
if not os.path.exists(relative_directory) or not os.path.isdir(relative_directory):
os.mkdir(relative_directory)
new_base_dir = os.path.join(relative_directory, f'ad_{self.ad_id}')
new_base_dir = os.path.join(relative_directory, f'ad_{id_}')
if os.path.exists(new_base_dir):
LOG.info('Deleting current folder of ad...')
shutil.rmtree(new_base_dir)
@@ -830,10 +840,78 @@ class KleinanzeigenBot(SeleniumMixin):
LOG.info('New directory for ad created at %s.', new_base_dir)
# call extraction function
info = self.extract_ad_page_info(new_base_dir)
ad_file_path = new_base_dir + '/' + f'ad_{self.ad_id}.yaml'
info = self.extract_ad_page_info(new_base_dir, id_)
ad_file_path = new_base_dir + '/' + f'ad_{id_}.yaml'
utils.save_dict(ad_file_path, info)
def start_download_routine(self):
    """
    Determines which download mode was chosen with the arguments and calls the matching download routine.

    Depending on ``self.ads_selector`` this downloads either all ads, only not-yet-saved ("new") ads,
    or specific ads given by a comma-separated list of IDs (e.g. "1,2,3").
    """
    if self.ads_selector in {'all', 'new'}:  # explore the ads overview page for these two modes
        LOG.info('Scanning your ad overview...')
        ext = extract.AdExtractor(self.webdriver)
        refs = ext.extract_own_ads_references()
        LOG.info('%d ads were found!', len(refs))
        if self.ads_selector == 'all':
            self._download_all_ads(refs)
        else:
            self._download_new_ads(refs)
    # NOTE: fullmatch with a proper group replaces the former r'\d+[,\d+]*' character class,
    # which also accepted malformed selectors such as '1,,2' or '1+2', and search(), which
    # matched mere substrings (e.g. '12abc') and then crashed on int() below
    elif re.fullmatch(r'\d+(?:,\d+)*', self.ads_selector):  # download ad(s) with specific ID(s)
        self._download_ads_by_id([int(part) for part in self.ads_selector.split(',')])
    else:
        LOG.error('Invalid ads selector for download: %s', self.ads_selector)

def _download_all_ads(self, refs: list[str]):
    """Downloads every ad referenced by the given list of ad page links."""
    LOG.info('Start fetch task for all your ads!')
    success_count = 0
    for ref in refs:
        ref_ad_id: int = utils.extract_ad_id_from_ad_link(ref)
        if self.navigate_to_ad_page(url=ref):
            self.download_ad_page(ref_ad_id)
            success_count += 1
    LOG.info("%d of %d ads were downloaded from your profile.", success_count, len(refs))

def _saved_ad_ids(self) -> set[int]:
    """Collects the IDs of all ads that are already saved locally (read from the configured ad files)."""
    data_root_dir = os.path.dirname(self.config_file_path)
    saved_ids: set[int] = set()
    for file_pattern in self.config["ad_files"]:
        for ad_file in glob.glob(file_pattern, root_dir=data_root_dir,
                                 flags=glob.GLOBSTAR | glob.BRACE | glob.EXTGLOB):
            ad_dict = utils.load_dict(abspath(ad_file, relative_to=data_root_dir))
            saved_ids.add(int(ad_dict['id']))
    return saved_ids

def _download_new_ads(self, refs: list[str]):
    """Downloads only those referenced ads that are not locally saved yet."""
    saved_ad_ids = self._saved_ad_ids()
    LOG.info('Start fetch task for your unsaved ads!')
    new_count = 0
    for ref in refs:
        id_: int = utils.extract_ad_id_from_ad_link(ref)
        if id_ in saved_ad_ids:  # skip ads that are already saved
            LOG.info('The ad with id %d has already been saved.', id_)
            continue
        if self.navigate_to_ad_page(url=ref):
            self.download_ad_page(id_)
            new_count += 1
    LOG.info('%d new ads were downloaded from your profile.', new_count)

def _download_ads_by_id(self, ids: list[int]):
    """Downloads the ads with the explicitly given IDs, looking each one up via the site search."""
    LOG.info('Start fetch task for the ad(s) with the id(s):')
    LOG.info(' | '.join(str(id_) for id_ in ids))
    for id_ in ids:  # call download routine for every ID
        if self.navigate_to_ad_page(id_):
            self.download_ad_page(id_)
            LOG.info('Downloaded ad with id %d', id_)
        else:
            LOG.error('The page with the id %d does not exist!', id_)
#############################
# main entry point

View File

@@ -2,14 +2,16 @@
Copyright (C) 2022 Sebastian Thomschke and contributors
SPDX-License-Identifier: AGPL-3.0-or-later
"""
from decimal import DecimalException
import json
from decimal import DecimalException
import selenium.webdriver.support.expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By
from selenium.webdriver.remote.webdriver import WebDriver
from selenium.webdriver.support.wait import WebDriverWait
from .utils import parse_decimal
from .utils import parse_decimal, pause, smooth_scroll_page
class AdExtractor:
@@ -147,3 +149,57 @@ class AdExtractor:
# also see 'https://themen.ebay-kleinanzeigen.de/hilfe/deine-anzeigen/Telefon/
return contact
def extract_own_ads_references(self) -> list[str]:
    """
    Extracts the references (links) to all of your own ads from the ad overview page.

    Walks through every pagination page of the overview and collects the link of each ad card.

    :return: the links to your ad pages (empty list if the profile has no ads)
    """
    # navigate to the overview page that lists your own ads
    self.driver.get('https://www.ebay-kleinanzeigen.de/m-meine-anzeigen.html')
    WebDriverWait(self.driver, 15).until(EC.url_contains('meine-anzeigen'))
    pause(2000, 3000)
    # collect ad references:
    pagination_section = self.driver.find_element(By.CSS_SELECTOR, 'section.jsx-1105488430:nth-child(4)')
    # scroll down so the dynamically loaded ad list is fully rendered
    smooth_scroll_page(self.driver)
    pause(2000, 3000)
    # detect whether the overview spans multiple pages
    try:
        pagination = pagination_section.find_element(By.XPATH, './/div/div[2]/div[2]/div')  # Pagination
    except NoSuchElementException:  # 0 ads - no pagination area
        print('There currently seem to be no ads on your profile!')
        return []
    n_buttons = len(pagination.find_element(By.XPATH, './/div[1]').find_elements(By.TAG_NAME, 'button'))
    # more than one pagination button means the ads span multiple overview pages
    multi_page: bool = n_buttons > 1
    if multi_page:
        print('It seems like you have many ads!')
    else:
        print('It seems like all your ads fit on one overview page.')
    refs: list[str] = []
    while True:  # loop reference extraction until there is no more forward page
        # extract the ad links of the current page
        list_section = self.driver.find_element(By.XPATH, '//*[@id="my-manageads-adlist"]')
        list_items = list_section.find_elements(By.CLASS_NAME, 'cardbox')
        refs += [li.find_element(By.XPATH, 'article/section/section[2]/h2/div/a').get_attribute('href') for li in list_items]
        if not multi_page:  # only one iteration for a single-page overview
            break
        # check if last page; guard against an empty button list which would raise IndexError on [-1]
        nav_buttons = self.driver.find_elements(By.CSS_SELECTOR, 'button.jsx-2828608826')
        if not nav_buttons or nav_buttons[-1].get_attribute('title') != 'Nächste':
            print('Last ad overview page explored.')
            break
        # navigate to the next overview page
        nav_buttons[-1].click()
        pause(2000, 3000)
        smooth_scroll_page(self.driver)
    return refs

View File

@@ -11,6 +11,7 @@ from typing import Any, Final, TypeVar
import coloredlogs, inflect
from ruamel.yaml import YAML
from selenium.webdriver.chrome.webdriver import WebDriver
LOG_ROOT:Final[logging.Logger] = logging.getLogger()
LOG:Final[logging.Logger] = logging.getLogger("kleinanzeigen_bot.utils")
@@ -270,3 +271,42 @@ def parse_datetime(date:datetime | str | None) -> datetime | None:
if isinstance(date, datetime):
return date
return datetime.fromisoformat(date)
def smooth_scroll_page(driver: WebDriver, scroll_length: int = 10, scroll_speed: int = 10000, scroll_back_top: bool = False):
    """
    Scrolls the current page of a web driver session smoothly to the bottom (and optionally back up).

    :param driver: the web driver session
    :param scroll_length: the length of a single scroll iteration in pixels; determines smoothness of scrolling, lower is smoother
    :param scroll_speed: the speed of scrolling, higher is faster
    :param scroll_back_top: whether to scroll the page back to the top after scrolling to the bottom
    :raises ValueError: if scroll_length or scroll_speed is not positive
    """
    if scroll_length <= 0 or scroll_speed <= 0:
        # a non-positive step would loop forever below; a zero speed would divide by zero
        raise ValueError('scroll_length and scroll_speed must be positive')
    current_y_pos = 0
    bottom_y_pos: int = driver.execute_script('return document.body.scrollHeight;')  # get bottom position via JS
    while current_y_pos < bottom_y_pos:  # scroll in steps until the bottom is reached
        current_y_pos += scroll_length
        driver.execute_script(f'window.scrollTo(0, {current_y_pos});')  # scroll one step
        time.sleep(scroll_length / scroll_speed)
    if scroll_back_top:  # scroll back to the top in the same style
        while current_y_pos > 0:
            current_y_pos -= scroll_length
            driver.execute_script(f'window.scrollTo(0, {current_y_pos});')
            time.sleep(scroll_length / scroll_speed / 2)  # double speed
def extract_ad_id_from_ad_link(url: str) -> int:
    """
    Extracts the ID of an ad, given by its reference link.

    The ID is expected to be the leading digits of the last path segment of the URL.

    :param url: the URL to the ad page
    :return: the ad ID, a (ten-digit) integer number; -1 if no ID could be extracted
    """
    last_segment = url.rsplit('/', 1)[-1]  # everything after the final slash
    candidate = last_segment.split('-', 1)[0]  # leading part up to the first dash
    try:
        return int(candidate)
    except ValueError:
        print('The ad ID could not be extracted from the given ad reference!')
        return -1