From f01109c956042dea5f0b112aeac7400a8cc6ae8d Mon Sep 17 00:00:00 2001
From: 1cu <1742418+1cu@users.noreply.github.com>
Date: Sun, 26 Jan 2025 23:37:33 +0100
Subject: [PATCH] feat: add hash-based ad change detection (#343) (#388)

Co-authored-by: sebthom <sebthom@users.noreply.github.com>
---
 README.md                                     |  9 ++-
 src/kleinanzeigen_bot/__init__.py             | 81 +++++++++++++------
 src/kleinanzeigen_bot/extract.py              |  5 +-
 .../resources/translations.de.yaml            |  6 ++
 src/kleinanzeigen_bot/utils.py                | 33 +++++++-
 5 files changed, 104 insertions(+), 30 deletions(-)
diff --git a/README.md b/README.md
index d8c1c0c..d61a812 100644
--- a/README.md
+++ b/README.md
@@ -281,7 +281,6 @@ browser:
 login:
   username: ""
   password: ""
-
 ```
 
 ### <a name="ad-config"></a>2) Ad configuration
@@ -342,9 +341,11 @@ contact:
 
 republication_interval: # every X days the ad should be re-published
 
-id: # set automatically
-created_on: # set automatically
-updated_on: # set automatically
+# The following fields are automatically managed by the bot:
+id: # the ID assigned by kleinanzeigen.de
+created_on: # ISO timestamp when the ad was first published
+updated_on: # ISO timestamp when the ad was last published
+content_hash: # hash of the ad content, used to detect changes
 ```
 
 ### <a name="existing-browser"></a>3) Using an existing browser window
diff --git a/src/kleinanzeigen_bot/__init__.py b/src/kleinanzeigen_bot/__init__.py
index d2e8889..ee325c0 100644
--- a/src/kleinanzeigen_bot/__init__.py
+++ b/src/kleinanzeigen_bot/__init__.py
@@ -19,7 +19,7 @@ from wcmatch import glob
 
 from . import utils, resources, extract
 from .i18n import Locale, get_current_locale, set_current_locale, get_translating_logger, pluralize
-from .utils import abspath, ainput, apply_defaults, ensure, is_frozen, safe_get, parse_datetime
+from .utils import abspath, ainput, apply_defaults, ensure, is_frozen, safe_get, parse_datetime, calculate_content_hash
 from .web_scraping_mixin import By, Element, Page, Is, WebScrapingMixin
 from ._version import __version__
 
@@ -262,6 +262,49 @@ class KleinanzeigenBot(WebScrapingMixin):
         LOG.info("App version: %s", self.get_version())
         LOG.info("Python version: %s", sys.version)
 
+    def __check_ad_republication(self, ad_cfg: dict[str, Any], ad_cfg_orig: dict[str, Any], ad_file_relative: str) -> bool:
+        """
+        Decide whether the given ad is due for (re-)publication.
+
+        :param ad_cfg: ad configuration the timestamps, ID, republication interval and current hash are taken from
+        :param ad_cfg_orig: ad configuration holding the stored "content_hash"; receives the new hash on a mismatch
+        :param ad_file_relative: name of the ad file as shown in log messages
+        :return: False if the content hash is unchanged and the ad is within its republication interval, else True
+        """
+        # ads without a usable publication timestamp or without an ID are always due
+        timestamp = ad_cfg["updated_on"] or ad_cfg["created_on"]
+        last_published = parse_datetime(timestamp) if timestamp else None
+        if not last_published or not ad_cfg["id"]:
+            return True
+
+        # NOTE(review): load_ads calls this before it adds the description prefix/suffix and replaces "@".
+        # If the stored hash was calculated from the processed ad_cfg, both hashes can never match - confirm.
+        current_hash = calculate_content_hash(ad_cfg)
+        stored_hash = ad_cfg_orig.get("content_hash")
+        LOG.debug("Hash comparison for [%s]:", ad_file_relative)
+        LOG.debug("    Stored hash: %s", stored_hash)
+        LOG.debug("    Current hash: %s", current_hash)
+
+        if not stored_hash or current_hash != stored_hash:
+            # a missing stored hash is handled like a content change
+            LOG.info("Changes detected in ad [%s], will republish", ad_file_relative)
+            ad_cfg_orig["content_hash"] = current_hash
+            return True
+
+        # content is unchanged, so only the age of the last publication decides
+        ad_age = datetime.utcnow() - last_published
+        republication_interval = ad_cfg["republication_interval"]
+        if ad_age.days <= republication_interval:
+            LOG.info(
+                " -> SKIPPED: ad [%s] was last published %d days ago. republication is only required every %s days",
+                ad_file_relative,
+                ad_age.days,
+                republication_interval
+            )
+            return False
+
+        return True
+
     def load_ads(self, *, ignore_inactive:bool = True, check_id:bool = True) -> list[tuple[str, dict[str, Any], dict[str, Any]]]:
         LOG.info("Searching for ad config files...")
 
@@ -275,8 +318,10 @@ class KleinanzeigenBot(WebScrapingMixin):
         if not ad_files:
             return []
 
-        descr_prefix = self.config["ad_defaults"]["description"]["prefix"] or ""
-        descr_suffix = self.config["ad_defaults"]["description"]["suffix"] or ""
+        description_config = {
+            "prefix": self.config["ad_defaults"]["description"]["prefix"] or "",
+            "suffix": self.config["ad_defaults"]["description"]["suffix"] or ""
+        }
 
         ids = []
         use_specific_ads = False
@@ -308,24 +353,10 @@ class KleinanzeigenBot(WebScrapingMixin):
                     continue
 
                 if self.ads_selector == "due":
-                    if ad_cfg["updated_on"]:
-                        last_updated_on = parse_datetime(ad_cfg["updated_on"])
-                    elif ad_cfg["created_on"]:
-                        last_updated_on = parse_datetime(ad_cfg["created_on"])
-                    else:
-                        last_updated_on = None
+                    if not self.__check_ad_republication(ad_cfg, ad_cfg_orig, ad_file_relative):
+                        continue
 
-                    if last_updated_on:
-                        ad_age = datetime.utcnow() - last_updated_on
-                        if ad_age.days <= ad_cfg["republication_interval"]:
-                            LOG.info(" -> SKIPPED: ad [%s] was last published %d days ago. republication is only required every %s days",
-                                ad_file_relative,
-                                ad_age.days,
-                                ad_cfg["republication_interval"]
-                            )
-                            continue
-
-            ad_cfg["description"] = descr_prefix + (ad_cfg["description"] or "") + descr_suffix
+            ad_cfg["description"] = description_config["prefix"] + (ad_cfg["description"] or "") + description_config["suffix"]
             ad_cfg["description"] = ad_cfg["description"].replace("@", "(at)")
             ensure(len(ad_cfg["description"]) <= 4000, f"Length of ad description including prefix and suffix exceeds 4000 chars. @ [{ad_file}]")
 
@@ -749,15 +780,17 @@ class KleinanzeigenBot(WebScrapingMixin):
 
         await self.web_await(lambda: "p-anzeige-aufgeben-bestaetigung.html?adId=" in self.page.url, timeout = 20)
 
-        ad_cfg_orig["updated_on"] = datetime.utcnow().isoformat()
-        if not ad_cfg["created_on"] and not ad_cfg["id"]:
-            ad_cfg_orig["created_on"] = ad_cfg_orig["updated_on"]
-
         # extract the ad id from the URL's query parameter
         current_url_query_params = urllib_parse.parse_qs(urllib_parse.urlparse(self.page.url).query)
         ad_id = int(current_url_query_params.get("adId", [])[0])
         ad_cfg_orig["id"] = ad_id
 
+        # Update content hash after successful publication
+        ad_cfg_orig["content_hash"] = calculate_content_hash(ad_cfg)
+        ad_cfg_orig["updated_on"] = datetime.utcnow().isoformat()
+        if not ad_cfg["created_on"] and not ad_cfg["id"]:
+            ad_cfg_orig["created_on"] = ad_cfg_orig["updated_on"]
+
         LOG.info(" -> SUCCESS: ad published with ID %s", ad_id)
 
         utils.save_dict(ad_file, ad_cfg_orig)
diff --git a/src/kleinanzeigen_bot/extract.py b/src/kleinanzeigen_bot/extract.py
index d08f31e..7d87df9 100644
--- a/src/kleinanzeigen_bot/extract.py
+++ b/src/kleinanzeigen_bot/extract.py
@@ -11,7 +11,7 @@ from typing import Any, Final
 import json
 
 from .i18n import get_translating_logger, pluralize
-from .utils import is_integer, parse_decimal, save_dict
+from .utils import is_integer, parse_decimal, save_dict, calculate_content_hash
 from .web_scraping_mixin import Browser, By, Element, Is, WebScrapingMixin
 
 __all__ = [
@@ -269,6 +269,9 @@ class AdExtractor(WebScrapingMixin):
         info['created_on'] = creation_date
         info['updated_on'] = None  # will be set later on
 
+        # Calculate the initial hash for the downloaded ad
+        info['content_hash'] = calculate_content_hash(info)
+
         return info
 
     async def _extract_category_from_ad_page(self) -> str:
diff --git a/src/kleinanzeigen_bot/resources/translations.de.yaml b/src/kleinanzeigen_bot/resources/translations.de.yaml
index 1b7426a..3f38417 100644
--- a/src/kleinanzeigen_bot/resources/translations.de.yaml
+++ b/src/kleinanzeigen_bot/resources/translations.de.yaml
@@ -113,6 +113,12 @@ kleinanzeigen_bot/__init__.py:
     'Downloaded ad with id %d': 'Anzeige mit der ID %d heruntergeladen'
     'The page with the id %d does not exist!': 'Die Seite mit der ID %d existiert nicht!'
 
+  __check_ad_republication:
+    "Hash comparison for [%s]:": "Hash-Vergleich für [%s]:"
+    "    Stored hash: %s": "    Gespeicherter Hash: %s"
+    "    Current hash: %s": "    Aktueller Hash: %s"
+    "Changes detected in ad [%s], will republish": "Änderungen in Anzeige [%s] erkannt, wird neu veröffentlicht"
+
 
 #################################################
 kleinanzeigen_bot/extract.py:
diff --git a/src/kleinanzeigen_bot/utils.py b/src/kleinanzeigen_bot/utils.py
index e525471..e177ef6 100644
--- a/src/kleinanzeigen_bot/utils.py
+++ b/src/kleinanzeigen_bot/utils.py
@@ -3,7 +3,7 @@ SPDX-FileCopyrightText: © Sebastian Thomschke and contributors
 SPDX-License-Identifier: AGPL-3.0-or-later
 SPDX-ArtifactOfProjectHomePage: https://github.com/Second-Hand-Friends/kleinanzeigen-bot/
 """
-import asyncio, copy, decimal, inspect, json, logging, os, re, socket, sys, traceback, time
+import asyncio, copy, decimal, inspect, json, logging, os, re, socket, sys, traceback, time, hashlib
 from importlib.resources import read_text as get_resource_as_string
 from collections.abc import Callable
 from datetime import datetime
@@ -333,3 +333,34 @@ def parse_datetime(date:datetime | str | None) -> datetime | None:
     if isinstance(date, datetime):
         return date
     return datetime.fromisoformat(date)
+
+
+def calculate_content_hash(ad_cfg: dict[str, Any]) -> str:
+    """Return the SHA-256 hex digest of the user-modifiable fields of the ad.
+
+    :param ad_cfg: the ad configuration; missing fields fall back to the defaults used below
+    :return: hex digest of the selected fields serialized as JSON with sorted keys
+    """
+    contact = ad_cfg.get("contact") or {}  # a key that is present but None is handled like a missing key
+    content = {
+        "active": bool(ad_cfg.get("active", True)),  # Explicitly convert to bool
+        "type": str(ad_cfg.get("type", "")),  # Explicitly convert to string
+        "title": str(ad_cfg.get("title", "")),
+        "description": str(ad_cfg.get("description", "")),
+        "category": str(ad_cfg.get("category", "")),
+        "price": str(ad_cfg.get("price", "")),  # Price always as string
+        "price_type": str(ad_cfg.get("price_type", "")),
+        "special_attributes": dict(ad_cfg.get("special_attributes") or {}),  # Copy the dict, None counts as empty
+        "shipping_type": str(ad_cfg.get("shipping_type", "")),
+        "shipping_costs": str(ad_cfg.get("shipping_costs", "")),
+        "shipping_options": sorted([str(x) for x in (ad_cfg.get("shipping_options") or [])]),  # Convert to list and sort
+        "sell_directly": bool(ad_cfg.get("sell_directly", False)),  # Explicitly convert to bool
+        "images": sorted([os.path.basename(img) if isinstance(img, str) else str(img) for img in (ad_cfg.get("images") or [])]),  # Only filenames
+        "contact": {
+            "name": str(contact.get("name", "")),
+            "street": str(contact.get("street", "None")),  # Explicitly "None" as string for None values
+            "zipcode": str(contact.get("zipcode", "")),
+            "phone": str(contact.get("phone", ""))
+        }
+    }
+    return hashlib.sha256(json.dumps(content, sort_keys=True).encode()).hexdigest()  # sorted keys keep the hash stable