From f01109c956042dea5f0b112aeac7400a8cc6ae8d Mon Sep 17 00:00:00 2001 From: 1cu <1742418+1cu@users.noreply.github.com> Date: Sun, 26 Jan 2025 23:37:33 +0100 Subject: [PATCH] feat: add hash-based ad change detection (#343) (#388) Co-authored-by: sebthom --- README.md | 9 ++- src/kleinanzeigen_bot/__init__.py | 81 +++++++++++++------ src/kleinanzeigen_bot/extract.py | 5 +- .../resources/translations.de.yaml | 6 ++ src/kleinanzeigen_bot/utils.py | 33 +++++++- 5 files changed, 104 insertions(+), 30 deletions(-) diff --git a/README.md b/README.md index d8c1c0c..d61a812 100644 --- a/README.md +++ b/README.md @@ -281,7 +281,6 @@ browser: login: username: "" password: "" - ``` ### 2) Ad configuration @@ -342,9 +341,11 @@ contact: republication_interval: # every X days the ad should be re-published -id: # set automatically -created_on: # set automatically -updated_on: # set automatically +# The following fields are automatically managed by the bot: +id: # the ID assigned by kleinanzeigen.de +created_on: # ISO timestamp when the ad was first published +updated_on: # ISO timestamp when the ad was last published +content_hash: # hash of the ad content, used to detect changes ``` ### 3) Using an existing browser window diff --git a/src/kleinanzeigen_bot/__init__.py b/src/kleinanzeigen_bot/__init__.py index d2e8889..ee325c0 100644 --- a/src/kleinanzeigen_bot/__init__.py +++ b/src/kleinanzeigen_bot/__init__.py @@ -19,7 +19,7 @@ from wcmatch import glob from . 
def __check_ad_republication(self, ad_cfg: dict[str, Any], ad_cfg_orig: dict[str, Any], ad_file_relative: str) -> bool:
    """
    Decide whether an ad is due for (re-)publication.

    :param ad_cfg: the effective ad configuration that is inspected and hashed
    :param ad_cfg_orig: the ad configuration as stored in the ad file; its
        "content_hash" entry is read and, when a change is detected, overwritten
    :param ad_file_relative: path of the ad file, only used in log messages
    :return: False only if the ad has an id, its content hash matches the stored
        one and its age in days is within the republication interval; True otherwise
    """
    # No usable publication timestamp at all -> treat the ad as due.
    published_at_raw = ad_cfg["updated_on"] or ad_cfg["created_on"]
    if not published_at_raw:
        return True

    published_at = parse_datetime(published_at_raw)
    if not published_at:
        return True

    # Ads without an id skip both the hash and the interval check.
    if not ad_cfg["id"]:
        return True

    # NOTE(review): the hash is computed from ad_cfg in the state it has at this
    # point - confirm that this matches the state used when the stored hash was
    # written (e.g. description prefix/suffix applied or not), otherwise every
    # run will report a change.
    new_hash = calculate_content_hash(ad_cfg)
    old_hash = ad_cfg_orig.get("content_hash")

    LOG.debug("Hash comparison for [%s]:", ad_file_relative)
    LOG.debug(" Stored hash: %s", old_hash)
    LOG.debug(" Current hash: %s", new_hash)

    content_unchanged = bool(old_hash) and new_hash == old_hash
    if not content_unchanged:
        # A missing stored hash is handled like a content change.
        LOG.info("Changes detected in ad [%s], will republish", ad_file_relative)
        # Update hash in original configuration
        ad_cfg_orig["content_hash"] = new_hash
        return True

    # Content is unchanged - only the republication interval decides now.
    age = datetime.utcnow() - published_at
    if age.days <= ad_cfg["republication_interval"]:
        LOG.info(
            " -> SKIPPED: ad [%s] was last published %d days ago. republication is only required every %s days",
            ad_file_relative,
            age.days,
            ad_cfg["republication_interval"]
        )
        return False

    return True
republication is only required every %s days", - ad_file_relative, - ad_age.days, - ad_cfg["republication_interval"] - ) - continue - - ad_cfg["description"] = descr_prefix + (ad_cfg["description"] or "") + descr_suffix + ad_cfg["description"] = description_config["prefix"] + (ad_cfg["description"] or "") + description_config["suffix"] ad_cfg["description"] = ad_cfg["description"].replace("@", "(at)") ensure(len(ad_cfg["description"]) <= 4000, f"Length of ad description including prefix and suffix exceeds 4000 chars. @ [{ad_file}]") @@ -749,15 +780,17 @@ class KleinanzeigenBot(WebScrapingMixin): await self.web_await(lambda: "p-anzeige-aufgeben-bestaetigung.html?adId=" in self.page.url, timeout = 20) - ad_cfg_orig["updated_on"] = datetime.utcnow().isoformat() - if not ad_cfg["created_on"] and not ad_cfg["id"]: - ad_cfg_orig["created_on"] = ad_cfg_orig["updated_on"] - # extract the ad id from the URL's query parameter current_url_query_params = urllib_parse.parse_qs(urllib_parse.urlparse(self.page.url).query) ad_id = int(current_url_query_params.get("adId", [])[0]) ad_cfg_orig["id"] = ad_id + # Update content hash after successful publication + ad_cfg_orig["content_hash"] = calculate_content_hash(ad_cfg) + ad_cfg_orig["updated_on"] = datetime.utcnow().isoformat() + if not ad_cfg["created_on"] and not ad_cfg["id"]: + ad_cfg_orig["created_on"] = ad_cfg_orig["updated_on"] + LOG.info(" -> SUCCESS: ad published with ID %s", ad_id) utils.save_dict(ad_file, ad_cfg_orig) diff --git a/src/kleinanzeigen_bot/extract.py b/src/kleinanzeigen_bot/extract.py index d08f31e..7d87df9 100644 --- a/src/kleinanzeigen_bot/extract.py +++ b/src/kleinanzeigen_bot/extract.py @@ -11,7 +11,7 @@ from typing import Any, Final import json from .i18n import get_translating_logger, pluralize -from .utils import is_integer, parse_decimal, save_dict +from .utils import is_integer, parse_decimal, save_dict, calculate_content_hash from .web_scraping_mixin import Browser, By, Element, Is, WebScrapingMixin 
__all__ = [ @@ -269,6 +269,9 @@ class AdExtractor(WebScrapingMixin): info['created_on'] = creation_date info['updated_on'] = None # will be set later on + # Calculate the initial hash for the downloaded ad + info['content_hash'] = calculate_content_hash(info) + return info async def _extract_category_from_ad_page(self) -> str: diff --git a/src/kleinanzeigen_bot/resources/translations.de.yaml b/src/kleinanzeigen_bot/resources/translations.de.yaml index 1b7426a..3f38417 100644 --- a/src/kleinanzeigen_bot/resources/translations.de.yaml +++ b/src/kleinanzeigen_bot/resources/translations.de.yaml @@ -113,6 +113,12 @@ kleinanzeigen_bot/__init__.py: 'Downloaded ad with id %d': 'Anzeige mit der ID %d heruntergeladen' 'The page with the id %d does not exist!': 'Die Seite mit der ID %d existiert nicht!' + __check_ad_republication: + "Hash comparison for [%s]:": "Hash-Vergleich für [%s]:" + " Stored hash: %s": " Gespeicherter Hash: %s" + " Current hash: %s": " Aktueller Hash: %s" + "Changes detected in ad [%s], will republish": "Änderungen in Anzeige [%s] erkannt, wird neu veröffentlicht" + ################################################# kleinanzeigen_bot/extract.py: diff --git a/src/kleinanzeigen_bot/utils.py b/src/kleinanzeigen_bot/utils.py index e525471..e177ef6 100644 --- a/src/kleinanzeigen_bot/utils.py +++ b/src/kleinanzeigen_bot/utils.py @@ -3,7 +3,7 @@ SPDX-FileCopyrightText: © Sebastian Thomschke and contributors SPDX-License-Identifier: AGPL-3.0-or-later SPDX-ArtifactOfProjectHomePage: https://github.com/Second-Hand-Friends/kleinanzeigen-bot/ """ -import asyncio, copy, decimal, inspect, json, logging, os, re, socket, sys, traceback, time +import asyncio, copy, decimal, inspect, json, logging, os, re, socket, sys, traceback, time, hashlib from importlib.resources import read_text as get_resource_as_string from collections.abc import Callable from datetime import datetime @@ -333,3 +333,34 @@ def parse_datetime(date:datetime | str | None) -> datetime | None: if 
def calculate_content_hash(ad_cfg: dict[str, Any]) -> str:
    """
    Calculate a SHA-256 hash over the user-modifiable fields of an ad.

    Keys that are missing and container keys that are present but set to None
    (e.g. an empty "images:" entry) are treated alike, so such configs no
    longer raise.

    :param ad_cfg: the ad configuration; only the fields listed below are read
    :return: hex digest of the SHA-256 hash of the normalized field values
    """
    # dict.get(key, default) only falls back when the key is missing, not when
    # its value is None - hence the explicit "or" fallbacks for containers.
    contact = ad_cfg.get("contact") or {}
    images = ad_cfg.get("images") or []
    special_attributes = ad_cfg.get("special_attributes") or {}
    shipping_options = ad_cfg.get("shipping_options") or []

    # Relevant fields for the hash
    content: dict[str, Any] = {
        "active": bool(ad_cfg.get("active", True)),
        "sell_directly": bool(ad_cfg.get("sell_directly", False)),
        "special_attributes": dict(special_attributes),  # shallow copy
        # Sorted so that a mere reordering does not count as a change
        "shipping_options": sorted(str(option) for option in shipping_options),
        # Only file names, so the hash does not depend on the directory
        "images": sorted(os.path.basename(img) if isinstance(img, str) else str(img) for img in images),
        "contact": {
            "name": str(contact.get("name", "")),
            # A missing street hashes like an explicit None value ("None")
            "street": str(contact.get("street", "None")),
            "zipcode": str(contact.get("zipcode", "")),
            "phone": str(contact.get("phone", ""))
        }
    }

    # Scalar fields are always compared as strings (e.g. price 10 vs "10")
    for field in ("type", "title", "description", "category", "price", "price_type", "shipping_type", "shipping_costs"):
        content[field] = str(ad_cfg.get(field, ""))

    # Create sorted JSON string for consistent hashes
    content_str = json.dumps(content, sort_keys=True)
    return hashlib.sha256(content_str.encode()).hexdigest()