diff --git a/README.md b/README.md
index d8c1c0c..d61a812 100644
--- a/README.md
+++ b/README.md
@@ -281,7 +281,6 @@ browser:
login:
username: ""
password: ""
-
```
### 2) Ad configuration
@@ -342,9 +341,11 @@ contact:
republication_interval: # every X days the ad should be re-published
-id: # set automatically
-created_on: # set automatically
-updated_on: # set automatically
+# The following fields are automatically managed by the bot:
+id: # the ID assigned by kleinanzeigen.de
+created_on: # ISO timestamp when the ad was first published
+updated_on: # ISO timestamp when the ad was last published
+content_hash: # hash of the ad content, used to detect changes
```
### 3) Using an existing browser window
diff --git a/src/kleinanzeigen_bot/__init__.py b/src/kleinanzeigen_bot/__init__.py
index d2e8889..ee325c0 100644
--- a/src/kleinanzeigen_bot/__init__.py
+++ b/src/kleinanzeigen_bot/__init__.py
@@ -19,7 +19,7 @@ from wcmatch import glob
from . import utils, resources, extract
from .i18n import Locale, get_current_locale, set_current_locale, get_translating_logger, pluralize
-from .utils import abspath, ainput, apply_defaults, ensure, is_frozen, safe_get, parse_datetime
+from .utils import abspath, ainput, apply_defaults, ensure, is_frozen, safe_get, parse_datetime, calculate_content_hash
from .web_scraping_mixin import By, Element, Page, Is, WebScrapingMixin
from ._version import __version__
@@ -262,6 +262,49 @@ class KleinanzeigenBot(WebScrapingMixin):
LOG.info("App version: %s", self.get_version())
LOG.info("Python version: %s", sys.version)
def __check_ad_republication(self, ad_cfg: dict[str, Any], ad_cfg_orig: dict[str, Any], ad_file_relative: str) -> bool:
    """
    Decide whether an ad is due for (re-)publication.

    An ad is due when it was never published, when its content changed since
    the last publication (detected via content hash), or when the configured
    republication interval has elapsed.

    :param ad_cfg: effective ad configuration (with defaults applied)
    :param ad_cfg_orig: ad configuration as loaded from file; its
                        "content_hash" entry is refreshed when changes are detected
    :param ad_file_relative: relative path of the ad config file, for log messages
    :return: True if the ad should be (re-)published
    """
    # Prefer the last-update timestamp, falling back to the creation timestamp.
    raw_timestamp = ad_cfg["updated_on"] if ad_cfg["updated_on"] else ad_cfg["created_on"]
    if not raw_timestamp:
        # no timestamp at all -> the ad was never published
        return True

    last_updated_on = parse_datetime(raw_timestamp)
    if not last_updated_on:
        return True

    if not ad_cfg["id"]:
        # no platform-assigned ID -> not yet published, so publish it
        return True

    current_hash = calculate_content_hash(ad_cfg)
    stored_hash = ad_cfg_orig.get("content_hash")

    LOG.debug("Hash comparison for [%s]:", ad_file_relative)
    LOG.debug(" Stored hash: %s", stored_hash)
    LOG.debug(" Current hash: %s", current_hash)

    if not stored_hash or current_hash != stored_hash:
        # content changed (or no hash recorded yet) -> republish and remember the new hash
        LOG.info("Changes detected in ad [%s], will republish", ad_file_relative)
        ad_cfg_orig["content_hash"] = current_hash
        return True

    # content unchanged -> republish only when the configured interval has elapsed
    ad_age = datetime.utcnow() - last_updated_on
    if ad_age.days <= ad_cfg["republication_interval"]:
        LOG.info(
            " -> SKIPPED: ad [%s] was last published %d days ago. republication is only required every %s days",
            ad_file_relative,
            ad_age.days,
            ad_cfg["republication_interval"]
        )
        return False

    return True
def load_ads(self, *, ignore_inactive:bool = True, check_id:bool = True) -> list[tuple[str, dict[str, Any], dict[str, Any]]]:
LOG.info("Searching for ad config files...")
@@ -275,8 +318,10 @@ class KleinanzeigenBot(WebScrapingMixin):
if not ad_files:
return []
- descr_prefix = self.config["ad_defaults"]["description"]["prefix"] or ""
- descr_suffix = self.config["ad_defaults"]["description"]["suffix"] or ""
+ description_config = {
+ "prefix": self.config["ad_defaults"]["description"]["prefix"] or "",
+ "suffix": self.config["ad_defaults"]["description"]["suffix"] or ""
+ }
ids = []
use_specific_ads = False
@@ -308,24 +353,10 @@ class KleinanzeigenBot(WebScrapingMixin):
continue
if self.ads_selector == "due":
- if ad_cfg["updated_on"]:
- last_updated_on = parse_datetime(ad_cfg["updated_on"])
- elif ad_cfg["created_on"]:
- last_updated_on = parse_datetime(ad_cfg["created_on"])
- else:
- last_updated_on = None
+ if not self.__check_ad_republication(ad_cfg, ad_cfg_orig, ad_file_relative):
+ continue
- if last_updated_on:
- ad_age = datetime.utcnow() - last_updated_on
- if ad_age.days <= ad_cfg["republication_interval"]:
- LOG.info(" -> SKIPPED: ad [%s] was last published %d days ago. republication is only required every %s days",
- ad_file_relative,
- ad_age.days,
- ad_cfg["republication_interval"]
- )
- continue
-
- ad_cfg["description"] = descr_prefix + (ad_cfg["description"] or "") + descr_suffix
+ ad_cfg["description"] = description_config["prefix"] + (ad_cfg["description"] or "") + description_config["suffix"]
ad_cfg["description"] = ad_cfg["description"].replace("@", "(at)")
ensure(len(ad_cfg["description"]) <= 4000, f"Length of ad description including prefix and suffix exceeds 4000 chars. @ [{ad_file}]")
@@ -749,15 +780,17 @@ class KleinanzeigenBot(WebScrapingMixin):
await self.web_await(lambda: "p-anzeige-aufgeben-bestaetigung.html?adId=" in self.page.url, timeout = 20)
- ad_cfg_orig["updated_on"] = datetime.utcnow().isoformat()
- if not ad_cfg["created_on"] and not ad_cfg["id"]:
- ad_cfg_orig["created_on"] = ad_cfg_orig["updated_on"]
-
# extract the ad id from the URL's query parameter
current_url_query_params = urllib_parse.parse_qs(urllib_parse.urlparse(self.page.url).query)
ad_id = int(current_url_query_params.get("adId", [])[0])
ad_cfg_orig["id"] = ad_id
+ # Update content hash after successful publication
+ ad_cfg_orig["content_hash"] = calculate_content_hash(ad_cfg)
+ ad_cfg_orig["updated_on"] = datetime.utcnow().isoformat()
+ if not ad_cfg["created_on"] and not ad_cfg["id"]:
+ ad_cfg_orig["created_on"] = ad_cfg_orig["updated_on"]
+
LOG.info(" -> SUCCESS: ad published with ID %s", ad_id)
utils.save_dict(ad_file, ad_cfg_orig)
diff --git a/src/kleinanzeigen_bot/extract.py b/src/kleinanzeigen_bot/extract.py
index d08f31e..7d87df9 100644
--- a/src/kleinanzeigen_bot/extract.py
+++ b/src/kleinanzeigen_bot/extract.py
@@ -11,7 +11,7 @@ from typing import Any, Final
import json
from .i18n import get_translating_logger, pluralize
-from .utils import is_integer, parse_decimal, save_dict
+from .utils import is_integer, parse_decimal, save_dict, calculate_content_hash
from .web_scraping_mixin import Browser, By, Element, Is, WebScrapingMixin
__all__ = [
@@ -269,6 +269,9 @@ class AdExtractor(WebScrapingMixin):
info['created_on'] = creation_date
info['updated_on'] = None # will be set later on
+ # Calculate the initial hash for the downloaded ad
+ info['content_hash'] = calculate_content_hash(info)
+
return info
async def _extract_category_from_ad_page(self) -> str:
diff --git a/src/kleinanzeigen_bot/resources/translations.de.yaml b/src/kleinanzeigen_bot/resources/translations.de.yaml
index 1b7426a..3f38417 100644
--- a/src/kleinanzeigen_bot/resources/translations.de.yaml
+++ b/src/kleinanzeigen_bot/resources/translations.de.yaml
@@ -113,6 +113,12 @@ kleinanzeigen_bot/__init__.py:
'Downloaded ad with id %d': 'Anzeige mit der ID %d heruntergeladen'
'The page with the id %d does not exist!': 'Die Seite mit der ID %d existiert nicht!'
+ __check_ad_republication:
+ "Hash comparison for [%s]:": "Hash-Vergleich für [%s]:"
+ " Stored hash: %s": " Gespeicherter Hash: %s"
+ " Current hash: %s": " Aktueller Hash: %s"
+ "Changes detected in ad [%s], will republish": "Änderungen in Anzeige [%s] erkannt, wird neu veröffentlicht"
+
#################################################
kleinanzeigen_bot/extract.py:
diff --git a/src/kleinanzeigen_bot/utils.py b/src/kleinanzeigen_bot/utils.py
index e525471..e177ef6 100644
--- a/src/kleinanzeigen_bot/utils.py
+++ b/src/kleinanzeigen_bot/utils.py
@@ -3,7 +3,7 @@ SPDX-FileCopyrightText: © Sebastian Thomschke and contributors
SPDX-License-Identifier: AGPL-3.0-or-later
SPDX-ArtifactOfProjectHomePage: https://github.com/Second-Hand-Friends/kleinanzeigen-bot/
"""
-import asyncio, copy, decimal, inspect, json, logging, os, re, socket, sys, traceback, time
+import asyncio, copy, decimal, inspect, json, logging, os, re, socket, sys, traceback, time, hashlib
from importlib.resources import read_text as get_resource_as_string
from collections.abc import Callable
from datetime import datetime
@@ -333,3 +333,34 @@ def parse_datetime(date:datetime | str | None) -> datetime | None:
if isinstance(date, datetime):
return date
return datetime.fromisoformat(date)
+
+
def calculate_content_hash(ad_cfg: dict[str, Any]) -> str:
    """Calculate a SHA-256 hash over the user-modifiable fields of an ad.

    The hash is used to detect whether an ad's content changed since it was
    last published. Only user-editable fields are included; bookkeeping
    fields (id, created_on, updated_on, content_hash) are deliberately
    excluded so that republishing alone does not alter the hash.

    NOTE: the normalization below defines the hash format. Any change to it
    invalidates all previously stored hashes and forces a one-time
    republication of every ad, so extend it only backward-compatibly.

    :param ad_cfg: the ad configuration dictionary
    :return: hex-encoded SHA-256 digest of the normalized ad content
    """
    # Tolerate an explicit "contact: null" in the YAML config (fixes TypeError);
    # also avoids looking the key up once per contact field.
    contact = ad_cfg.get("contact") or {}

    # Normalize every field to a stable, JSON-serializable representation.
    content = {
        "active": bool(ad_cfg.get("active", True)),  # explicitly convert to bool
        "type": str(ad_cfg.get("type", "")),  # explicitly convert to string
        "title": str(ad_cfg.get("title", "")),
        "description": str(ad_cfg.get("description", "")),
        "category": str(ad_cfg.get("category", "")),
        "price": str(ad_cfg.get("price", "")),  # price always as string
        "price_type": str(ad_cfg.get("price_type", "")),
        # copy the dict; "or {}" tolerates "special_attributes: null" (fixes TypeError)
        "special_attributes": dict(ad_cfg.get("special_attributes") or {}),
        "shipping_type": str(ad_cfg.get("shipping_type", "")),
        "shipping_costs": str(ad_cfg.get("shipping_costs", "")),
        # sorted so the hash is independent of the option order in the config
        "shipping_options": sorted(str(x) for x in (ad_cfg.get("shipping_options") or [])),
        "sell_directly": bool(ad_cfg.get("sell_directly", False)),  # explicitly convert to bool
        # only filenames, so moving the ad folder does not change the hash;
        # "or []" tolerates "images: null" (fixes TypeError, consistent with shipping_options)
        "images": sorted(os.path.basename(img) if isinstance(img, str) else str(img) for img in (ad_cfg.get("images") or [])),
        "contact": {
            "name": str(contact.get("name", "")),
            "street": str(contact.get("street", "None")),  # a missing street and an explicit None both normalize to the string "None"
            "zipcode": str(contact.get("zipcode", "")),
            "phone": str(contact.get("phone", ""))
        }
    }

    # sort_keys makes the JSON serialization - and thus the hash - deterministic
    content_str = json.dumps(content, sort_keys=True)
    return hashlib.sha256(content_str.encode()).hexdigest()