feat: add hash-based ad change detection (#343) (#388)

Co-authored-by: sebthom <sebthom@users.noreply.github.com>
This commit is contained in:
1cu
2025-01-26 23:37:33 +01:00
committed by GitHub
parent 3d27755207
commit f01109c956
5 changed files with 104 additions and 30 deletions

View File

@@ -281,7 +281,6 @@ browser:
login: login:
username: "" username: ""
password: "" password: ""
``` ```
### <a name="ad-config"></a>2) Ad configuration ### <a name="ad-config"></a>2) Ad configuration
@@ -342,9 +341,11 @@ contact:
republication_interval: # every X days the ad should be re-published republication_interval: # every X days the ad should be re-published
id: # set automatically # The following fields are automatically managed by the bot:
created_on: # set automatically id: # the ID assigned by kleinanzeigen.de
updated_on: # set automatically created_on: # ISO timestamp when the ad was first published
updated_on: # ISO timestamp when the ad was last published
content_hash: # hash of the ad content, used to detect changes
``` ```
### <a name="existing-browser"></a>3) Using an existing browser window ### <a name="existing-browser"></a>3) Using an existing browser window

View File

@@ -19,7 +19,7 @@ from wcmatch import glob
from . import utils, resources, extract from . import utils, resources, extract
from .i18n import Locale, get_current_locale, set_current_locale, get_translating_logger, pluralize from .i18n import Locale, get_current_locale, set_current_locale, get_translating_logger, pluralize
from .utils import abspath, ainput, apply_defaults, ensure, is_frozen, safe_get, parse_datetime from .utils import abspath, ainput, apply_defaults, ensure, is_frozen, safe_get, parse_datetime, calculate_content_hash
from .web_scraping_mixin import By, Element, Page, Is, WebScrapingMixin from .web_scraping_mixin import By, Element, Page, Is, WebScrapingMixin
from ._version import __version__ from ._version import __version__
@@ -262,6 +262,49 @@ class KleinanzeigenBot(WebScrapingMixin):
LOG.info("App version: %s", self.get_version()) LOG.info("App version: %s", self.get_version())
LOG.info("Python version: %s", sys.version) LOG.info("Python version: %s", sys.version)
def __check_ad_republication(self, ad_cfg: dict[str, Any], ad_cfg_orig: dict[str, Any], ad_file_relative: str) -> bool:
    """
    Decide whether an ad is due for (re)publication.

    Returns False only when the ad content is unchanged AND it was last
    published within the configured republication interval; True in every
    other case. As a side effect, refreshes ``content_hash`` in
    *ad_cfg_orig* when a content change (or a missing stored hash) is
    detected for an already-published ad.
    """
    # prefer the last update timestamp, fall back to the creation timestamp
    timestamp = ad_cfg["updated_on"] or ad_cfg["created_on"]
    if not timestamp:
        # never published -> publish now
        return True
    last_publication = parse_datetime(timestamp)
    if not last_publication:
        return True

    if not ad_cfg["id"]:
        # no kleinanzeigen.de ID yet -> first publication, nothing to compare
        return True

    current_hash = calculate_content_hash(ad_cfg)
    stored_hash = ad_cfg_orig.get("content_hash")
    LOG.debug("Hash comparison for [%s]:", ad_file_relative)
    LOG.debug(" Stored hash: %s", stored_hash)
    LOG.debug(" Current hash: %s", current_hash)

    if not stored_hash or current_hash != stored_hash:
        # content changed (or hash never stored) -> republish and remember the new hash
        LOG.info("Changes detected in ad [%s], will republish", ad_file_relative)
        ad_cfg_orig["content_hash"] = current_hash
        return True

    # unchanged content -> only republish once the interval has elapsed
    # NOTE(review): utcnow() is naive; assumes parse_datetime also yields naive UTC timestamps
    ad_age = datetime.utcnow() - last_publication
    if ad_age.days <= ad_cfg["republication_interval"]:
        LOG.info(
            " -> SKIPPED: ad [%s] was last published %d days ago. republication is only required every %s days",
            ad_file_relative,
            ad_age.days,
            ad_cfg["republication_interval"]
        )
        return False
    return True
def load_ads(self, *, ignore_inactive:bool = True, check_id:bool = True) -> list[tuple[str, dict[str, Any], dict[str, Any]]]: def load_ads(self, *, ignore_inactive:bool = True, check_id:bool = True) -> list[tuple[str, dict[str, Any], dict[str, Any]]]:
LOG.info("Searching for ad config files...") LOG.info("Searching for ad config files...")
@@ -275,8 +318,10 @@ class KleinanzeigenBot(WebScrapingMixin):
if not ad_files: if not ad_files:
return [] return []
descr_prefix = self.config["ad_defaults"]["description"]["prefix"] or "" description_config = {
descr_suffix = self.config["ad_defaults"]["description"]["suffix"] or "" "prefix": self.config["ad_defaults"]["description"]["prefix"] or "",
"suffix": self.config["ad_defaults"]["description"]["suffix"] or ""
}
ids = [] ids = []
use_specific_ads = False use_specific_ads = False
@@ -308,24 +353,10 @@ class KleinanzeigenBot(WebScrapingMixin):
continue continue
if self.ads_selector == "due": if self.ads_selector == "due":
if ad_cfg["updated_on"]: if not self.__check_ad_republication(ad_cfg, ad_cfg_orig, ad_file_relative):
last_updated_on = parse_datetime(ad_cfg["updated_on"]) continue
elif ad_cfg["created_on"]:
last_updated_on = parse_datetime(ad_cfg["created_on"])
else:
last_updated_on = None
if last_updated_on: ad_cfg["description"] = description_config["prefix"] + (ad_cfg["description"] or "") + description_config["suffix"]
ad_age = datetime.utcnow() - last_updated_on
if ad_age.days <= ad_cfg["republication_interval"]:
LOG.info(" -> SKIPPED: ad [%s] was last published %d days ago. republication is only required every %s days",
ad_file_relative,
ad_age.days,
ad_cfg["republication_interval"]
)
continue
ad_cfg["description"] = descr_prefix + (ad_cfg["description"] or "") + descr_suffix
ad_cfg["description"] = ad_cfg["description"].replace("@", "(at)") ad_cfg["description"] = ad_cfg["description"].replace("@", "(at)")
ensure(len(ad_cfg["description"]) <= 4000, f"Length of ad description including prefix and suffix exceeds 4000 chars. @ [{ad_file}]") ensure(len(ad_cfg["description"]) <= 4000, f"Length of ad description including prefix and suffix exceeds 4000 chars. @ [{ad_file}]")
@@ -749,15 +780,17 @@ class KleinanzeigenBot(WebScrapingMixin):
await self.web_await(lambda: "p-anzeige-aufgeben-bestaetigung.html?adId=" in self.page.url, timeout = 20) await self.web_await(lambda: "p-anzeige-aufgeben-bestaetigung.html?adId=" in self.page.url, timeout = 20)
ad_cfg_orig["updated_on"] = datetime.utcnow().isoformat()
if not ad_cfg["created_on"] and not ad_cfg["id"]:
ad_cfg_orig["created_on"] = ad_cfg_orig["updated_on"]
# extract the ad id from the URL's query parameter # extract the ad id from the URL's query parameter
current_url_query_params = urllib_parse.parse_qs(urllib_parse.urlparse(self.page.url).query) current_url_query_params = urllib_parse.parse_qs(urllib_parse.urlparse(self.page.url).query)
ad_id = int(current_url_query_params.get("adId", [])[0]) ad_id = int(current_url_query_params.get("adId", [])[0])
ad_cfg_orig["id"] = ad_id ad_cfg_orig["id"] = ad_id
# Update content hash after successful publication
ad_cfg_orig["content_hash"] = calculate_content_hash(ad_cfg)
ad_cfg_orig["updated_on"] = datetime.utcnow().isoformat()
if not ad_cfg["created_on"] and not ad_cfg["id"]:
ad_cfg_orig["created_on"] = ad_cfg_orig["updated_on"]
LOG.info(" -> SUCCESS: ad published with ID %s", ad_id) LOG.info(" -> SUCCESS: ad published with ID %s", ad_id)
utils.save_dict(ad_file, ad_cfg_orig) utils.save_dict(ad_file, ad_cfg_orig)

View File

@@ -11,7 +11,7 @@ from typing import Any, Final
import json import json
from .i18n import get_translating_logger, pluralize from .i18n import get_translating_logger, pluralize
from .utils import is_integer, parse_decimal, save_dict from .utils import is_integer, parse_decimal, save_dict, calculate_content_hash
from .web_scraping_mixin import Browser, By, Element, Is, WebScrapingMixin from .web_scraping_mixin import Browser, By, Element, Is, WebScrapingMixin
__all__ = [ __all__ = [
@@ -269,6 +269,9 @@ class AdExtractor(WebScrapingMixin):
info['created_on'] = creation_date info['created_on'] = creation_date
info['updated_on'] = None # will be set later on info['updated_on'] = None # will be set later on
# Calculate the initial hash for the downloaded ad
info['content_hash'] = calculate_content_hash(info)
return info return info
async def _extract_category_from_ad_page(self) -> str: async def _extract_category_from_ad_page(self) -> str:

View File

@@ -113,6 +113,12 @@ kleinanzeigen_bot/__init__.py:
'Downloaded ad with id %d': 'Anzeige mit der ID %d heruntergeladen' 'Downloaded ad with id %d': 'Anzeige mit der ID %d heruntergeladen'
'The page with the id %d does not exist!': 'Die Seite mit der ID %d existiert nicht!' 'The page with the id %d does not exist!': 'Die Seite mit der ID %d existiert nicht!'
__check_ad_republication:
"Hash comparison for [%s]:": "Hash-Vergleich für [%s]:"
" Stored hash: %s": " Gespeicherter Hash: %s"
" Current hash: %s": " Aktueller Hash: %s"
"Changes detected in ad [%s], will republish": "Änderungen in Anzeige [%s] erkannt, wird neu veröffentlicht"
################################################# #################################################
kleinanzeigen_bot/extract.py: kleinanzeigen_bot/extract.py:

View File

@@ -3,7 +3,7 @@ SPDX-FileCopyrightText: © Sebastian Thomschke and contributors
SPDX-License-Identifier: AGPL-3.0-or-later SPDX-License-Identifier: AGPL-3.0-or-later
SPDX-ArtifactOfProjectHomePage: https://github.com/Second-Hand-Friends/kleinanzeigen-bot/ SPDX-ArtifactOfProjectHomePage: https://github.com/Second-Hand-Friends/kleinanzeigen-bot/
""" """
import asyncio, copy, decimal, inspect, json, logging, os, re, socket, sys, traceback, time import asyncio, copy, decimal, inspect, json, logging, os, re, socket, sys, traceback, time, hashlib
from importlib.resources import read_text as get_resource_as_string from importlib.resources import read_text as get_resource_as_string
from collections.abc import Callable from collections.abc import Callable
from datetime import datetime from datetime import datetime
@@ -333,3 +333,34 @@ def parse_datetime(date:datetime | str | None) -> datetime | None:
if isinstance(date, datetime): if isinstance(date, datetime):
return date return date
return datetime.fromisoformat(date) return datetime.fromisoformat(date)
def calculate_content_hash(ad_cfg: dict[str, Any]) -> str:
    """Calculate a SHA-256 hex digest over the user-modifiable fields of an ad.

    Bot-managed fields (id, created_on, updated_on, content_hash) are
    deliberately excluded so the hash changes only when the user edits the
    ad's content. All values are normalized (stringified, collections sorted,
    image paths reduced to their basenames) so semantically identical
    configurations produce identical hashes across runs and platforms.

    :param ad_cfg: the ad configuration dictionary
    :return: hex-encoded SHA-256 hash of the normalized ad content
    """
    # Treat an explicit None the same as a missing key for nested/collection
    # fields (a YAML key with an empty value loads as None), matching the
    # "or []" guard already applied to shipping_options.
    contact = ad_cfg.get("contact") or {}
    content = {
        "active": bool(ad_cfg.get("active", True)),  # explicitly convert to bool
        "type": str(ad_cfg.get("type", "")),  # explicitly convert to string
        "title": str(ad_cfg.get("title", "")),
        "description": str(ad_cfg.get("description", "")),
        "category": str(ad_cfg.get("category", "")),
        "price": str(ad_cfg.get("price", "")),  # price always as string to avoid int/float drift
        "price_type": str(ad_cfg.get("price_type", "")),
        "special_attributes": dict(ad_cfg.get("special_attributes") or {}),  # defensive copy
        "shipping_type": str(ad_cfg.get("shipping_type", "")),
        "shipping_costs": str(ad_cfg.get("shipping_costs", "")),
        "shipping_options": sorted(str(x) for x in (ad_cfg.get("shipping_options") or [])),  # order-independent
        "sell_directly": bool(ad_cfg.get("sell_directly", False)),  # explicitly convert to bool
        # only file names, so moving the ad folder does not change the hash
        "images": sorted(os.path.basename(img) if isinstance(img, str) else str(img) for img in (ad_cfg.get("images") or [])),
        "contact": {
            "name": str(contact.get("name", "")),
            "street": str(contact.get("street", "None")),  # str(None) == "None": kept for backward-compatible hashes
            "zipcode": str(contact.get("zipcode", "")),
            "phone": str(contact.get("phone", ""))
        }
    }
    # sorted keys -> canonical JSON string -> stable hash across Python versions
    content_str = json.dumps(content, sort_keys=True)
    return hashlib.sha256(content_str.encode()).hexdigest()