Add download command. Fixes #32 (#114)

This commit is contained in:
PhilK-7
2022-10-25 22:27:10 +02:00
committed by GitHub
parent 963d7dc3db
commit 5321b7cd12
3 changed files with 352 additions and 6 deletions

View File

@@ -177,8 +177,9 @@ It is the spiritual successor to [Second-Hand-Friends/ebayKleinanzeigen](https:/
Usage: kleinanzeigen-bot COMMAND [OPTIONS]
Commands:
publish - (re-)publishes ads
verify - verifies the configuration files
publish - (re-)publishes ads
verify - verifies the configuration files
download - downloads an ad
--
help - displays this help (default command)
version - displays the application version
@@ -191,6 +192,7 @@ Options:
* new: only publish new ads (i.e. ads that have no id in the config file)
--force - alias for '--ads=all'
--keep-old - don't delete old ads on republication
--ad <ID> - provide the ad ID after this option when using the download command
--config=<PATH> - path to the config YAML or JSON file (DEFAULT: ./config.yaml)
--logfile=<PATH> - path to the logfile (DEFAULT: ./kleinanzeigen-bot.log)
-v, --verbose - enables verbose output - only useful when troubleshooting issues

View File

@@ -3,10 +3,13 @@ Copyright (C) 2022 Sebastian Thomschke and contributors
SPDX-License-Identifier: AGPL-3.0-or-later
"""
import atexit, copy, getopt, importlib.metadata, json, logging, os, signal, sys, textwrap, time, urllib
import shutil
from collections.abc import Iterable
from datetime import datetime
from logging.handlers import RotatingFileHandler
from typing import Any, Final
from urllib import request
from wcmatch import glob
from overrides import overrides
@@ -15,10 +18,12 @@ from selenium.common.exceptions import NoSuchElementException, TimeoutException,
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from . import utils, resources
from . import utils, resources, extract # pylint: disable=W0406
from .utils import abspath, apply_defaults, ensure, is_frozen, pause, pluralize, safe_get
from .selenium_mixin import SeleniumMixin
# W0406: possibly a bug, see https://github.com/PyCQA/pylint/issues/3933
LOG_ROOT:Final[logging.Logger] = logging.getLogger()
LOG:Final[logging.Logger] = logging.getLogger("kleinanzeigen_bot")
LOG.setLevel(logging.INFO)
@@ -47,6 +52,7 @@ class KleinanzeigenBot(SeleniumMixin):
self.ads_selector = "due"
self.delete_old_ads = True
self.delete_ads_by_title = False
self.ad_id = None # attribute needed when downloading an ad
def __del__(self) -> None:
if self.file_log:
@@ -91,6 +97,26 @@ class KleinanzeigenBot(SeleniumMixin):
LOG.info("############################################")
LOG.info("DONE: No ads to delete found.")
LOG.info("############################################")
case "download":
self.configure_file_logging()
# ad ID passed as value to download command
if self.ad_id is None:
LOG.error('Provide the flag \'--ad\' with a valid ad ID to use the download command!')
sys.exit(2)
if self.ad_id < 1:
LOG.error('The given ad ID must be valid!')
sys.exit(2)
LOG.info('Start fetch task for ad with ID %s', str(self.ad_id))
self.load_config()
self.create_webdriver_session()
self.login()
# call download function
exists = self.navigate_to_ad_page()
if exists:
self.download_ad_page()
else:
sys.exit(2)
case _:
LOG.error("Unknown command: %s", self.command)
sys.exit(2)
@@ -107,9 +133,10 @@ class KleinanzeigenBot(SeleniumMixin):
Usage: {exe} COMMAND [OPTIONS]
Commands:
publish - (re-)publishes ads
verify - verifies the configuration files
delete - deletes ads
publish - (re-)publishes ads
verify - verifies the configuration files
delete - deletes ads
download - downloads an ad
--
help - displays this help (default command)
version - displays the application version
@@ -122,6 +149,7 @@ class KleinanzeigenBot(SeleniumMixin):
* new: only publish new ads (i.e. ads that have no id in the config file)
--force - alias for '--ads=all'
--keep-old - don't delete old ads on republication
--ad <ID> - provide the ad ID after this option when using the download command
--config=<PATH> - path to the config YAML or JSON file (DEFAULT: ./config.yaml)
--logfile=<PATH> - path to the logfile (DEFAULT: ./kleinanzeigen-bot.log)
-v, --verbose - enables verbose output - only useful when troubleshooting issues
@@ -135,6 +163,7 @@ class KleinanzeigenBot(SeleniumMixin):
"force",
"help",
"keep-old",
"ad=",
"logfile=",
"verbose"
])
@@ -161,6 +190,12 @@ class KleinanzeigenBot(SeleniumMixin):
self.ads_selector = "all"
case "--keep-old":
self.delete_old_ads = False
case "--ad":
try:
self.ad_id:int = int(value)
except ValueError: # given value cannot be parsed as integer
LOG.error('The given ad ID (\"%s\") is not a valid number!', value)
sys.exit(2)
case "-v" | "--verbose":
LOG.setLevel(logging.DEBUG)
@@ -628,6 +663,171 @@ class KleinanzeigenBot(SeleniumMixin):
else:
raise TimeoutException("Loading page failed, it still shows fullscreen ad.") from ex
def navigate_to_ad_page(self) -> bool:
    """
    Navigates to an ad page specified with an ad ID.

    The ID stored in ``self.ad_id`` is typed into the site search field and the search is submitted.
    Afterwards a possibly shown overlay popup is closed.

    :return: whether the navigation to the ad page was successful
    """
    # enter the ad ID into the search bar
    self.web_input(By.XPATH, '//*[@id="site-search-query"]', str(self.ad_id))
    # navigate to ad page and wait
    self.web_click(By.XPATH, '//*[@id="site-search-submit"]')
    pause(1000, 2000)

    # handle the case that invalid ad ID given:
    # a resulting URL ending in 'k0' is treated as "nothing found"
    if self.webdriver.current_url.endswith('k0'):
        LOG.error('There is no ad under the given ID.')
        return False

    try:  # close (warning) popup, if given
        self.webdriver.find_element(By.CSS_SELECTOR, '#vap-ovrly-secure')
        LOG.warning('A popup appeared.')
        close_button = self.webdriver.find_element(By.CLASS_NAME, 'mfp-close')
        close_button.click()
        time.sleep(1)
    except NoSuchElementException:
        # no popup is the regular case; report it via the logger instead of stdout
        LOG.debug('(no popup given)')
    return True
def download_images_from_ad_page(self, directory:str, ad_id:int, logger:logging.Logger) -> list[str]:
    """
    Downloads all images of an ad.

    Only images that were actually fetched are counted and returned.

    :param directory: the path of the directory created for this ad
    :param ad_id: the ID of the ad to download the images from
    :param logger: an initialized logger
    :return: the relative paths for all downloaded images
    """
    img_paths:list[str] = []
    try:
        image_box = self.webdriver.find_element(By.CSS_SELECTOR, '.galleryimage-large')

        # if gallery image box exists, proceed with image fetching
        n_images = 1

        # determine number of images (1 ... N)
        next_button = None
        try:  # check if multiple images given
            image_counter = image_box.find_element(By.CSS_SELECTOR, '.galleryimage--info')
        except NoSuchElementException:
            logger.info('Only one image found.')
        else:
            # counter text is sliced after its first two characters, e.g. '1/5' -> '5'
            n_images = int(image_counter.text[2:])
            logger.info('Found %d images.', n_images)
            try:
                next_button = self.webdriver.find_element(By.CSS_SELECTOR, '.galleryimage--navigation--next')
            except NoSuchElementException:
                next_button = None  # reported below, once scrolling is actually required

        # download all images from box
        img_element = image_box.find_element(By.XPATH, './/div[1]/img')
        img_fn_prefix = 'ad_' + str(ad_id) + '__img'

        img_nr = 1
        dl_counter = 0
        while img_nr <= n_images:  # scrolling + downloading
            current_img_url = img_element.get_attribute('src')  # URL of the image
            # verify https (for Bandit linter); a missing 'src' attribute is skipped as well
            if current_img_url and current_img_url.startswith('https'):
                file_ending = current_img_url.split('.')[-1].lower()
                img_path = directory + '/' + img_fn_prefix + str(img_nr) + '.' + file_ending
                request.urlretrieve(current_img_url, img_path)  # nosec B310
                dl_counter += 1
                img_paths.append(img_path.split('/')[-1])
            else:
                logger.warning('Skipping image %d, it has no https source URL.', img_nr)

            # scroll to next image (if exists)
            if img_nr < n_images:
                if next_button is None:
                    logger.error('NEXT button in image gallery somehow missing, abort image fetching.')
                    break
                try:
                    # click next button, wait, and reestablish reference
                    next_button.click()
                    # NOTE(review): the lambda returns the condition object instead of evaluating it,
                    # so this wait presumably finishes immediately - confirm against web_await
                    # before changing it, a real staleness wait might time out here.
                    self.web_await(lambda _: EC.staleness_of(img_element))
                    new_div = self.webdriver.find_element(By.CSS_SELECTOR, f'div.galleryimage-element:nth-child({img_nr + 1})')
                    img_element = new_div.find_element(By.XPATH, './/img')
                except NoSuchElementException:
                    logger.error('NEXT button in image gallery somehow missing, abort image fetching.')
                    break
            img_nr += 1
        logger.info('Downloaded %d image(s).', dl_counter)

    except NoSuchElementException:  # some ads do not require images
        logger.warning('No image area found. Continue without downloading images.')

    return img_paths
def extract_ad_page_info(self, directory:str) -> dict:
    """
    Extracts all necessary information from an ad's page.

    Assumes that the web driver currently shows the page of the ad with ID ``self.ad_id``.
    As a side effect the ad's images are downloaded into the given directory.

    :param directory: the path of the ad's previously created directory
    :return: a dictionary with the keys as given in an ad YAML, and their respective values
    """
    info:dict[str, Any] = {'active': True}

    # extract basic info; the ad type is derived from the URL of the ad page
    info['type'] = 'OFFER' if 's-anzeige' in self.webdriver.current_url else 'WANTED'
    title:str = self.webdriver.find_element(By.CSS_SELECTOR, '#viewad-title').text
    LOG.info('Extracting information from ad with title \"%s\"', title)
    info['title'] = title
    info['description'] = self.webdriver.find_element(By.XPATH, '//*[@id="viewad-description-text"]').text

    extractor = extract.AdExtractor(self.webdriver)

    # extract category
    info['category'] = extractor.extract_category_from_ad_page()

    # get special attributes
    info['special_attributes'] = extractor.extract_special_attributes_from_ad_page()

    # process pricing
    info['price'], info['price_type'] = extractor.extract_pricing_info_from_ad_page()

    # process shipping
    info['shipping_type'], info['shipping_costs'] = extractor.extract_shipping_info_from_ad_page()

    # fetch images
    info['images'] = self.download_images_from_ad_page(directory, self.ad_id, LOG)

    # process address
    info['contact'] = extractor.extract_contact_from_ad_page()

    # process meta info
    info['republication_interval'] = 7  # a default value for downloaded ads
    info['id'] = self.ad_id
    try:  # try different locations known for creation date element
        creation_date = self.webdriver.find_element(By.XPATH, '/html/body/div[1]/div[2]/div/section[2]/section/section/article/div[3]/div[2]/div[2]/'
                                                              'div[1]/span').text
    except NoSuchElementException:
        creation_date = self.webdriver.find_element(By.CSS_SELECTOR, '#viewad-extra-info > div:nth-child(1) > span:nth-child(2)').text

    # the page shows the date as DD.MM.YYYY; the time of day is unknown and therefore midnight
    info['created_on'] = datetime.strptime(creation_date.strip(), '%d.%m.%Y')
    info['updated_on'] = None  # will be set later on

    return info
def download_ad_page(self) -> None:
    """
    Downloads an ad to a specific location, specified by config and ad_id.

    The target is a sub-directory ``ad_<ID>`` below the part of the first configured
    'ad_files' pattern that precedes '**'. An already existing directory of this ad
    is removed first, so the download always starts from a clean state.
    """
    # create sub-directory for ad to download
    relative_directory = str(self.config['ad_files'][0]).split('**', maxsplit=1)[0]
    new_base_dir = os.path.join(relative_directory, f'ad_{self.ad_id}')
    if os.path.exists(new_base_dir):
        LOG.info('Deleting current folder of ad...')
        shutil.rmtree(new_base_dir)
    # makedirs instead of mkdir: the configured ads directory itself may not exist yet
    os.makedirs(new_base_dir)
    LOG.info('New directory for ad created at %s.', new_base_dir)

    # call extraction function
    info = self.extract_ad_page_info(new_base_dir)
    ad_file_path = os.path.join(new_base_dir, f'ad_{self.ad_id}.yaml')
    utils.save_dict(ad_file_path, info)
#############################
# main entry point

View File

@@ -0,0 +1,144 @@
"""
Copyright (C) 2022 Sebastian Thomschke and contributors
SPDX-License-Identifier: AGPL-3.0-or-later
"""
from decimal import DecimalException
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By
from selenium.webdriver.remote.webdriver import WebDriver
from .utils import parse_decimal
class AdExtractor:
"""
Wrapper class for ad extraction that uses an active bot´s web driver to extract specific elements from an ad page.
"""
def __init__(self, driver:WebDriver):
self.driver = driver
def extract_category_from_ad_page(self) -> str:
"""
Extracts a category of an ad in numerical form.
Assumes that the web driver currently shows an ad page.
:return: a category string of form abc/def, where a-f are digits
"""
category_line = self.driver.find_element(By.XPATH, '//*[@id="vap-brdcrmb"]')
category_first_part = category_line.find_element(By.XPATH, './/a[2]')
category_second_part = category_line.find_element(By.XPATH, './/a[3]')
cat_num_first = category_first_part.get_attribute('href').split('/')[-1][1:]
cat_num_second = category_second_part.get_attribute('href').split('/')[-1][1:]
category:str = cat_num_first + '/' + cat_num_second
return category
def extract_special_attributes_from_ad_page(self) -> dict:
"""
Extracts the special attributes from an ad page.
:return: a dictionary (possibly empty) where the keys are the attribute names, mapped to their values
"""
try:
details_box = self.driver.find_element(By.CSS_SELECTOR, '#viewad-details')
details_list = details_box.find_element(By.XPATH, './/ul')
list_items = details_list.find_elements(By.TAG_NAME, 'li')
details = {}
for list_item in list_items:
detail_key = list_item.text.split('\n')[0]
detail_value = list_item.find_element(By.TAG_NAME, 'span').text
details[detail_key] = detail_value
return details
except NoSuchElementException:
return {}
def extract_pricing_info_from_ad_page(self) -> (float | None, str):
"""
Extracts the pricing information (price and pricing type) from an ad page.
:return: the price of the offer (optional); and the pricing type
"""
try:
price_str:str = self.driver.find_element(By.CLASS_NAME, 'boxedarticle--price').text
price_type:str
price:float | None = -1
match price_str.split()[-1]:
case '':
price_type = 'FIXED'
price = float(parse_decimal(price_str.split()[0].replace('.', '')))
case 'VB': # can be either 'X € VB', or just 'VB'
price_type = 'NEGOTIABLE'
try:
price = float(parse_decimal(price_str.split()[0].replace('.', '')))
except DecimalException:
price = None
case 'verschenken':
price_type = 'GIVE_AWAY'
price = None
case _:
price_type = 'NOT_APPLICABLE'
return price, price_type
except NoSuchElementException: # no 'commercial' ad, has no pricing box etc.
return None, 'NOT_APPLICABLE'
def extract_shipping_info_from_ad_page(self) -> (str, float | None):
"""
Extracts shipping information from an ad page.
:return: the shipping type, and the shipping price (optional)
"""
ship_type, ship_costs = 'NOT_APPLICABLE', None
try:
shipping_text = self.driver.find_element(By.CSS_SELECTOR, '.boxedarticle--details--shipping') \
.text.strip()
# e.g. '+ Versand ab 5,49 €' OR 'Nur Abholung'
if shipping_text == 'Nur Abholung':
ship_type = 'PICKUP'
elif shipping_text == 'Versand möglich':
ship_type = 'SHIPPING'
elif '' in shipping_text:
shipping_price_parts = shipping_text.split(' ')
shipping_price = float(parse_decimal(shipping_price_parts[-2]))
ship_type = 'SHIPPING'
ship_costs = shipping_price
except NoSuchElementException: # no pricing box -> no shipping given
ship_type = 'NOT_APPLICABLE'
return ship_type, ship_costs
def extract_contact_from_ad_page(self) -> dict:
"""
Processes the address part involving street (optional), zip code + city, and phone number (optional).
:return: a dictionary containing the address parts with their corresponding values
"""
contact = {}
address_element = self.driver.find_element(By.CSS_SELECTOR, '#viewad-locality')
address_text = address_element.text.strip()
# format: e.g. (Beispiel Allee 42,) 12345 Bundesland - Stadt
try:
street_element = self.driver.find_element(By.XPATH, '//*[@id="street-address"]')
street = street_element.text[:-2] # trailing comma and whitespace
contact['street'] = street
except NoSuchElementException:
print('No street given in the contact.')
# construct remaining address
address_halves = address_text.split(' - ')
address_left_parts = address_halves[0].split(' ') # zip code and region/city
contact['zipcode'] = address_left_parts[0]
contact['name'] = address_halves[1]
if 'street' not in contact:
contact['street'] = None
try: # phone number is unusual for non-professional sellers today
phone_element = self.driver.find_element(By.CSS_SELECTOR, '#viewad-contact-phone')
phone_number = phone_element.find_element(By.TAG_NAME, 'a').text
contact['phone'] = ''.join(phone_number.replace('-', ' ').split(' ')).replace('+49(0)', '0')
except NoSuchElementException:
contact['phone'] = None # phone seems to be a deprecated feature
# also see 'https://themen.ebay-kleinanzeigen.de/hilfe/deine-anzeigen/Telefon/
return contact