feat: add hash-based ad change detection (#343) (#388)

Co-authored-by: sebthom <sebthom@users.noreply.github.com>
This commit is contained in:
1cu
2025-01-26 23:37:33 +01:00
committed by GitHub
parent 3d27755207
commit f01109c956
5 changed files with 104 additions and 30 deletions

View File

@@ -281,7 +281,6 @@ browser:
login: login:
username: "" username: ""
password: "" password: ""
``` ```
### <a name="ad-config"></a>2) Ad configuration ### <a name="ad-config"></a>2) Ad configuration
@@ -342,9 +341,11 @@ contact:
republication_interval: # every X days the ad should be re-published republication_interval: # every X days the ad should be re-published
id: # set automatically # The following fields are automatically managed by the bot:
created_on: # set automatically id: # the ID assigned by kleinanzeigen.de
updated_on: # set automatically created_on: # ISO timestamp when the ad was first published
updated_on: # ISO timestamp when the ad was last published
content_hash: # hash of the ad content, used to detect changes
``` ```
### <a name="existing-browser"></a>3) Using an existing browser window ### <a name="existing-browser"></a>3) Using an existing browser window

View File

@@ -19,7 +19,7 @@ from wcmatch import glob
from . import utils, resources, extract from . import utils, resources, extract
from .i18n import Locale, get_current_locale, set_current_locale, get_translating_logger, pluralize from .i18n import Locale, get_current_locale, set_current_locale, get_translating_logger, pluralize
from .utils import abspath, ainput, apply_defaults, ensure, is_frozen, safe_get, parse_datetime from .utils import abspath, ainput, apply_defaults, ensure, is_frozen, safe_get, parse_datetime, calculate_content_hash
from .web_scraping_mixin import By, Element, Page, Is, WebScrapingMixin from .web_scraping_mixin import By, Element, Page, Is, WebScrapingMixin
from ._version import __version__ from ._version import __version__
@@ -262,6 +262,49 @@ class KleinanzeigenBot(WebScrapingMixin):
LOG.info("App version: %s", self.get_version()) LOG.info("App version: %s", self.get_version())
LOG.info("Python version: %s", sys.version) LOG.info("Python version: %s", sys.version)
def __check_ad_republication(self, ad_cfg: dict[str, Any], ad_cfg_orig: dict[str, Any], ad_file_relative: str) -> bool:
    """
    Decide whether an ad is due for (re)publication.

    Returns False only when the ad content is unchanged AND it was last
    published within the configured republication interval; True in every
    other case. As a side effect, refreshes ``content_hash`` in
    *ad_cfg_orig* when a content change (or a missing stored hash) is
    detected for an already-published ad.
    """
    # prefer the last update timestamp, fall back to the creation timestamp
    timestamp = ad_cfg["updated_on"] or ad_cfg["created_on"]
    if not timestamp:
        # never published -> publish now
        return True
    last_publication = parse_datetime(timestamp)
    if not last_publication:
        return True

    if not ad_cfg["id"]:
        # no kleinanzeigen.de ID yet -> first publication, nothing to compare
        return True

    current_hash = calculate_content_hash(ad_cfg)
    stored_hash = ad_cfg_orig.get("content_hash")
    LOG.debug("Hash comparison for [%s]:", ad_file_relative)
    LOG.debug(" Stored hash: %s", stored_hash)
    LOG.debug(" Current hash: %s", current_hash)

    if not stored_hash or current_hash != stored_hash:
        # content changed (or hash never stored) -> republish and remember the new hash
        LOG.info("Changes detected in ad [%s], will republish", ad_file_relative)
        ad_cfg_orig["content_hash"] = current_hash
        return True

    # unchanged content -> only republish once the interval has elapsed
    # NOTE(review): utcnow() is naive; assumes parse_datetime also yields naive UTC timestamps
    ad_age = datetime.utcnow() - last_publication
    if ad_age.days <= ad_cfg["republication_interval"]:
        LOG.info(
            " -> SKIPPED: ad [%s] was last published %d days ago. republication is only required every %s days",
            ad_file_relative,
            ad_age.days,
            ad_cfg["republication_interval"]
        )
        return False
    return True
def load_ads(self, *, ignore_inactive:bool = True, check_id:bool = True) -> list[tuple[str, dict[str, Any], dict[str, Any]]]: def load_ads(self, *, ignore_inactive:bool = True, check_id:bool = True) -> list[tuple[str, dict[str, Any], dict[str, Any]]]:
LOG.info("Searching for ad config files...") LOG.info("Searching for ad config files...")
@@ -275,8 +318,10 @@ class KleinanzeigenBot(WebScrapingMixin):
if not ad_files: if not ad_files:
return [] return []
descr_prefix = self.config["ad_defaults"]["description"]["prefix"] or "" description_config = {
descr_suffix = self.config["ad_defaults"]["description"]["suffix"] or "" "prefix": self.config["ad_defaults"]["description"]["prefix"] or "",
"suffix": self.config["ad_defaults"]["description"]["suffix"] or ""
}
ids = [] ids = []
use_specific_ads = False use_specific_ads = False
@@ -308,24 +353,10 @@ class KleinanzeigenBot(WebScrapingMixin):
continue continue
if self.ads_selector == "due": if self.ads_selector == "due":
if ad_cfg["updated_on"]: if not self.__check_ad_republication(ad_cfg, ad_cfg_orig, ad_file_relative):
last_updated_on = parse_datetime(ad_cfg["updated_on"]) continue
elif ad_cfg["created_on"]:
last_updated_on = parse_datetime(ad_cfg["created_on"])
else:
last_updated_on = None
if last_updated_on: ad_cfg["description"] = description_config["prefix"] + (ad_cfg["description"] or "") + description_config["suffix"]
ad_age = datetime.utcnow() - last_updated_on
if ad_age.days <= ad_cfg["republication_interval"]:
LOG.info(" -> SKIPPED: ad [%s] was last published %d days ago. republication is only required every %s days",
ad_file_relative,
ad_age.days,
ad_cfg["republication_interval"]
)
continue
ad_cfg["description"] = descr_prefix + (ad_cfg["description"] or "") + descr_suffix
ad_cfg["description"] = ad_cfg["description"].replace("@", "(at)") ad_cfg["description"] = ad_cfg["description"].replace("@", "(at)")
ensure(len(ad_cfg["description"]) <= 4000, f"Length of ad description including prefix and suffix exceeds 4000 chars. @ [{ad_file}]") ensure(len(ad_cfg["description"]) <= 4000, f"Length of ad description including prefix and suffix exceeds 4000 chars. @ [{ad_file}]")
@@ -749,15 +780,17 @@ class KleinanzeigenBot(WebScrapingMixin):
await self.web_await(lambda: "p-anzeige-aufgeben-bestaetigung.html?adId=" in self.page.url, timeout = 20) await self.web_await(lambda: "p-anzeige-aufgeben-bestaetigung.html?adId=" in self.page.url, timeout = 20)
ad_cfg_orig["updated_on"] = datetime.utcnow().isoformat()
if not ad_cfg["created_on"] and not ad_cfg["id"]:
ad_cfg_orig["created_on"] = ad_cfg_orig["updated_on"]
# extract the ad id from the URL's query parameter # extract the ad id from the URL's query parameter
current_url_query_params = urllib_parse.parse_qs(urllib_parse.urlparse(self.page.url).query) current_url_query_params = urllib_parse.parse_qs(urllib_parse.urlparse(self.page.url).query)
ad_id = int(current_url_query_params.get("adId", [])[0]) ad_id = int(current_url_query_params.get("adId", [])[0])
ad_cfg_orig["id"] = ad_id ad_cfg_orig["id"] = ad_id
# Update content hash after successful publication
ad_cfg_orig["content_hash"] = calculate_content_hash(ad_cfg)
ad_cfg_orig["updated_on"] = datetime.utcnow().isoformat()
if not ad_cfg["created_on"] and not ad_cfg["id"]:
ad_cfg_orig["created_on"] = ad_cfg_orig["updated_on"]
LOG.info(" -> SUCCESS: ad published with ID %s", ad_id) LOG.info(" -> SUCCESS: ad published with ID %s", ad_id)
utils.save_dict(ad_file, ad_cfg_orig) utils.save_dict(ad_file, ad_cfg_orig)

View File

@@ -11,7 +11,7 @@ from typing import Any, Final
import json import json
from .i18n import get_translating_logger, pluralize from .i18n import get_translating_logger, pluralize
from .utils import is_integer, parse_decimal, save_dict from .utils import is_integer, parse_decimal, save_dict, calculate_content_hash
from .web_scraping_mixin import Browser, By, Element, Is, WebScrapingMixin from .web_scraping_mixin import Browser, By, Element, Is, WebScrapingMixin
__all__ = [ __all__ = [
@@ -269,6 +269,9 @@ class AdExtractor(WebScrapingMixin):
info['created_on'] = creation_date info['created_on'] = creation_date
info['updated_on'] = None # will be set later on info['updated_on'] = None # will be set later on
# Calculate the initial hash for the downloaded ad
info['content_hash'] = calculate_content_hash(info)
return info return info
async def _extract_category_from_ad_page(self) -> str: async def _extract_category_from_ad_page(self) -> str:

View File

@@ -113,6 +113,12 @@ kleinanzeigen_bot/__init__.py:
'Downloaded ad with id %d': 'Anzeige mit der ID %d heruntergeladen' 'Downloaded ad with id %d': 'Anzeige mit der ID %d heruntergeladen'
'The page with the id %d does not exist!': 'Die Seite mit der ID %d existiert nicht!' 'The page with the id %d does not exist!': 'Die Seite mit der ID %d existiert nicht!'
__check_ad_republication:
"Hash comparison for [%s]:": "Hash-Vergleich für [%s]:"
" Stored hash: %s": " Gespeicherter Hash: %s"
" Current hash: %s": " Aktueller Hash: %s"
"Changes detected in ad [%s], will republish": "Änderungen in Anzeige [%s] erkannt, wird neu veröffentlicht"
################################################# #################################################
kleinanzeigen_bot/extract.py: kleinanzeigen_bot/extract.py:

View File

@@ -3,7 +3,7 @@ SPDX-FileCopyrightText: © Sebastian Thomschke and contributors
SPDX-License-Identifier: AGPL-3.0-or-later SPDX-License-Identifier: AGPL-3.0-or-later
SPDX-ArtifactOfProjectHomePage: https://github.com/Second-Hand-Friends/kleinanzeigen-bot/ SPDX-ArtifactOfProjectHomePage: https://github.com/Second-Hand-Friends/kleinanzeigen-bot/
""" """
import asyncio, copy, decimal, inspect, json, logging, os, re, socket, sys, traceback, time import asyncio, copy, decimal, inspect, json, logging, os, re, socket, sys, traceback, time, hashlib
from importlib.resources import read_text as get_resource_as_string from importlib.resources import read_text as get_resource_as_string
from collections.abc import Callable from collections.abc import Callable
from datetime import datetime from datetime import datetime
@@ -333,3 +333,34 @@ def parse_datetime(date:datetime | str | None) -> datetime | None:
if isinstance(date, datetime): if isinstance(date, datetime):
return date return date
return datetime.fromisoformat(date) return datetime.fromisoformat(date)
def calculate_content_hash(ad_cfg: dict[str, Any]) -> str:
    """Calculate a SHA-256 hex digest over the user-modifiable fields of an ad.

    Bot-managed fields (id, created_on, updated_on, content_hash) are
    deliberately excluded so the hash changes only when the user edits the
    ad's content. All values are normalized (stringified, collections sorted,
    image paths reduced to their basenames) so semantically identical
    configurations produce identical hashes across runs and platforms.

    :param ad_cfg: the ad configuration dictionary
    :return: hex-encoded SHA-256 hash of the normalized ad content
    """
    # Treat an explicit None the same as a missing key for nested/collection
    # fields (a YAML key with an empty value loads as None), matching the
    # "or []" guard already applied to shipping_options.
    contact = ad_cfg.get("contact") or {}
    content = {
        "active": bool(ad_cfg.get("active", True)),  # explicitly convert to bool
        "type": str(ad_cfg.get("type", "")),  # explicitly convert to string
        "title": str(ad_cfg.get("title", "")),
        "description": str(ad_cfg.get("description", "")),
        "category": str(ad_cfg.get("category", "")),
        "price": str(ad_cfg.get("price", "")),  # price always as string to avoid int/float drift
        "price_type": str(ad_cfg.get("price_type", "")),
        "special_attributes": dict(ad_cfg.get("special_attributes") or {}),  # defensive copy
        "shipping_type": str(ad_cfg.get("shipping_type", "")),
        "shipping_costs": str(ad_cfg.get("shipping_costs", "")),
        "shipping_options": sorted(str(x) for x in (ad_cfg.get("shipping_options") or [])),  # order-independent
        "sell_directly": bool(ad_cfg.get("sell_directly", False)),  # explicitly convert to bool
        # only file names, so moving the ad folder does not change the hash
        "images": sorted(os.path.basename(img) if isinstance(img, str) else str(img) for img in (ad_cfg.get("images") or [])),
        "contact": {
            "name": str(contact.get("name", "")),
            "street": str(contact.get("street", "None")),  # str(None) == "None": kept for backward-compatible hashes
            "zipcode": str(contact.get("zipcode", "")),
            "phone": str(contact.get("phone", ""))
        }
    }
    # sorted keys -> canonical JSON string -> stable hash across Python versions
    content_str = json.dumps(content, sort_keys=True)
    return hashlib.sha256(content_str.encode()).hexdigest()