use venv

2026-03-12 10:31:50 +01:00 · 2024-03-04 10:07:47 +01:00
parent 284c6d2bb4
commit 9caa7a7124
15 changed files with 15 additions and 18 deletions
--- a/src/kleinanzeigen_bot/extract.py
+++ b/src/kleinanzeigen_bot/extract.py
@@ -0,0 +1,238 @@
+"""
+SPDX-FileCopyrightText: © Sebastian Thomschke and contributors
+SPDX-License-Identifier: AGPL-3.0-or-later
+SPDX-ArtifactOfProjectHomePage: https://github.com/Second-Hand-Friends/kleinanzeigen-bot/
+"""
+import json
+from decimal import DecimalException
+from typing import Any
+
+from selenium.common.exceptions import NoSuchElementException
+from selenium.webdriver.common.by import By
+from selenium.webdriver.remote.webdriver import WebDriver
+import selenium.webdriver.support.expected_conditions as EC
+
+from .selenium_mixin import SeleniumMixin
+from .utils import parse_decimal, pause
+
+
+class AdExtractor(SeleniumMixin):
+    """
+    Wrapper class for ad extraction that uses an active bot´s web driver to extract specific elements from an ad page.
+    """
+
+    def __init__(self, driver:WebDriver):
+        super().__init__()
+        self.webdriver = driver
+
+    def extract_category_from_ad_page(self) -> str:
+        """
+        Extracts a category of an ad in numerical form.
+        Assumes that the web driver currently shows an ad page.
+
+        :return: a category string of form abc/def, where a-f are digits
+        """
+        category_line = self.webdriver.find_element(By.XPATH, '//*[@id="vap-brdcrmb"]')
+        category_first_part = category_line.find_element(By.XPATH, './/a[2]')
+        category_second_part = category_line.find_element(By.XPATH, './/a[3]')
+        cat_num_first = category_first_part.get_attribute('href').split('/')[-1][1:]
+        cat_num_second = category_second_part.get_attribute('href').split('/')[-1][1:]
+        category:str = cat_num_first + '/' + cat_num_second
+
+        return category
+
+    def extract_special_attributes_from_ad_page(self) -> dict[str, Any]:
+        """
+        Extracts the special attributes from an ad page.
+
+        :return: a dictionary (possibly empty) where the keys are the attribute names, mapped to their values
+        """
+        belen_conf = self.webdriver.execute_script("return window.BelenConf")
+        special_attributes_str = belen_conf["universalAnalyticsOpts"]["dimensions"]["dimension108"]
+        special_attributes = json.loads(special_attributes_str)
+        if not isinstance(special_attributes, dict):
+            raise ValueError(
+                "Failed to parse special attributes from ad page."
+                f"Expected a dictionary, but got a {type(special_attributes)}"
+            )
+        special_attributes = {k: v for k, v in special_attributes.items() if not k.endswith('.versand_s')}
+        return special_attributes
+
+    def extract_pricing_info_from_ad_page(self) -> tuple[float | None, str]:
+        """
+        Extracts the pricing information (price and pricing type) from an ad page.
+
+        :return: the price of the offer (optional); and the pricing type
+        """
+        try:
+            price_str:str = self.webdriver.find_element(By.CLASS_NAME, 'boxedarticle--price').text
+            price_type:str
+            price:float | None = -1
+            match price_str.split()[-1]:
+                case '€':
+                    price_type = 'FIXED'
+                    price = float(parse_decimal(price_str.split()[0].replace('.', '')))
+                case 'VB':  # can be either 'X € VB', or just 'VB'
+                    price_type = 'NEGOTIABLE'
+                    try:
+                        price = float(parse_decimal(price_str.split()[0].replace('.', '')))
+                    except DecimalException:
+                        price = None
+                case 'verschenken':
+                    price_type = 'GIVE_AWAY'
+                    price = None
+                case _:
+                    price_type = 'NOT_APPLICABLE'
+            return price, price_type
+        except NoSuchElementException:  # no 'commercial' ad, has no pricing box etc.
+            return None, 'NOT_APPLICABLE'
+
+    def extract_shipping_info_from_ad_page(self) -> tuple[str, float | None, list[str] | None]:
+        """
+        Extracts shipping information from an ad page.
+
+        :return: the shipping type, and the shipping price (optional)
+        """
+        ship_type, ship_costs, shipping_options = 'NOT_APPLICABLE', None, None
+        try:
+            shipping_text = self.webdriver.find_element(By.CSS_SELECTOR, '.boxedarticle--details--shipping') \
+                .text.strip()
+            # e.g. '+ Versand ab 5,49 €' OR 'Nur Abholung'
+            if shipping_text == 'Nur Abholung':
+                ship_type = 'PICKUP'
+            elif shipping_text == 'Versand möglich':
+                ship_type = 'SHIPPING'
+            elif '€' in shipping_text:
+                shipping_price_parts = shipping_text.split(' ')
+                ship_type = 'SHIPPING'
+                ship_costs = float(parse_decimal(shipping_price_parts[-2]))
+
+                # extract shipping options
+                # It is only possible the extract the cheapest shipping option,
+                # as the other options are not shown
+                shipping_option_mapping = {
+                    "DHL_2": "5,49",
+                    "Hermes_Päckchen": "4,50",
+                    "Hermes_S": "4,95",
+                    "DHL_5": "6,99",
+                    "Hermes_M": "5,95",
+                    "DHL_10": "9,49",
+                    "DHL_31,5": "16,49",
+                    "Hermes_L": "10,95",
+                }
+                for shipping_option, shipping_price in shipping_option_mapping.items():
+                    if shipping_price in shipping_text:
+                        shipping_options = [shipping_option]
+                        break
+        except NoSuchElementException:  # no pricing box -> no shipping given
+            ship_type = 'NOT_APPLICABLE'
+
+        return ship_type, ship_costs, shipping_options
+
+    def extract_sell_directly_from_ad_page(self) -> bool | None:
+        """
+        Extracts the sell directly option from an ad page.
+
+        :return: a boolean indicating whether the sell directly option is active (optional)
+        """
+        try:
+            buy_now_is_active = self.webdriver.find_element(By.ID, 'j-buy-now').text == "Direkt kaufen"
+            return buy_now_is_active
+        except NoSuchElementException:
+            return None
+
+    def extract_contact_from_ad_page(self) -> dict[str, (str | None)]:
+        """
+        Processes the address part involving street (optional), zip code + city, and phone number (optional).
+
+        :return: a dictionary containing the address parts with their corresponding values
+        """
+        contact:dict[str, (str | None)] = {}
+        address_element = self.webdriver.find_element(By.CSS_SELECTOR, '#viewad-locality')
+        address_text = address_element.text.strip()
+        # format: e.g. (Beispiel Allee 42,) 12345 Bundesland - Stadt
+        try:
+            street_element = self.webdriver.find_element(By.XPATH, '//*[@id="street-address"]')
+            street = street_element.text[:-2]  # trailing comma and whitespace
+            contact['street'] = street
+        except NoSuchElementException:
+            print('No street given in the contact.')
+        # construct remaining address
+        address_halves = address_text.split(' - ')
+        address_left_parts = address_halves[0].split(' ')  # zip code and region/city
+        contact['zipcode'] = address_left_parts[0]
+
+        contact_person_element = self.webdriver.find_element(By.CSS_SELECTOR, '#viewad-contact')
+        name_element = contact_person_element.find_element(By.CLASS_NAME, 'iconlist-text')
+        try:
+            name = name_element.find_element(By.TAG_NAME, 'a').text
+        except NoSuchElementException:  # edge case: name without link
+            name = name_element.find_element(By.TAG_NAME, 'span').text
+        contact['name'] = name
+
+        if 'street' not in contact:
+            contact['street'] = None
+        try:  # phone number is unusual for non-professional sellers today
+            phone_element = self.webdriver.find_element(By.CSS_SELECTOR, '#viewad-contact-phone')
+            phone_number = phone_element.find_element(By.TAG_NAME, 'a').text
+            contact['phone'] = ''.join(phone_number.replace('-', ' ').split(' ')).replace('+49(0)', '0')
+        except NoSuchElementException:
+            contact['phone'] = None  # phone seems to be a deprecated feature (for non-professional users)
+        # also see 'https://themen.kleinanzeigen.de/hilfe/deine-anzeigen/Telefon/
+
+        return contact
+
+    def extract_own_ads_references(self) -> list[str]:
+        """
+        Extracts the references to all own ads.
+
+        :return: the links to your ad pages
+        """
+        # navigate to your ads page
+        self.webdriver.get('https://www.kleinanzeigen.de/m-meine-anzeigen.html')
+        self.web_await(EC.url_contains('meine-anzeigen'), 15)
+        pause(2000, 3000)
+
+        # collect ad references:
+
+        pagination_section = self.webdriver.find_element(By.CSS_SELECTOR, '.l-splitpage')\
+            .find_element(By.XPATH, './/section[4]')
+        # scroll down to load dynamically
+        self.web_scroll_page_down()
+        pause(2000, 3000)
+        # detect multi-page
+        try:
+            pagination = pagination_section.find_element(By.XPATH, './/div/div[2]/div[2]/div')  # Pagination
+        except NoSuchElementException:  # 0 ads - no pagination area
+            print('There currently seem to be no ads on your profile!')
+            return []
+
+        n_buttons = len(pagination.find_element(By.XPATH, './/div[1]').find_elements(By.TAG_NAME, 'button'))
+        multi_page:bool
+        if n_buttons > 1:
+            multi_page = True
+            print('It seems like you have many ads!')
+        else:
+            multi_page = False
+            print('It seems like all your ads fit on one overview page.')
+
+        refs:list[str] = []
+        while True:  # loop reference extraction until no more forward page
+            # extract references
+            list_section = self.webdriver.find_element(By.XPATH, '//*[@id="my-manageads-adlist"]')
+            list_items = list_section.find_elements(By.CLASS_NAME, 'cardbox')
+            refs += [li.find_element(By.XPATH, 'article/section/section[2]/h2/div/a').get_attribute('href') for li in list_items]
+
+            if not multi_page:  # only one iteration for single-page overview
+                break
+            # check if last page
+            nav_button = self.webdriver.find_elements(By.CSS_SELECTOR, 'button.jsx-2828608826')[-1]
+            if nav_button.get_attribute('title') != 'Nächste':
+                print('Last ad overview page explored.')
+                break
+            # navigate to next overview page
+            nav_button.click()
+            pause(2000, 3000)
+            self.web_scroll_page_down()
+
+        return refs