feat: improve content_hash calculation

This commit is contained in:
sebthom
2025-05-15 11:48:57 +02:00
committed by Sebastian Thomschke
parent f1cd597dd8
commit 85a5cf5224
13 changed files with 356 additions and 301 deletions

View File

@@ -1,7 +1,7 @@
# SPDX-FileCopyrightText: © Sebastian Thomschke and contributors
# SPDX-License-Identifier: AGPL-3.0-or-later
# SPDX-ArtifactOfProjectHomePage: https://github.com/Second-Hand-Friends/kleinanzeigen-bot/
import atexit, copy, json, os, re, signal, sys, textwrap # isort: skip
import atexit, json, os, re, signal, sys, textwrap # isort: skip
import getopt # pylint: disable=deprecated-module
import urllib.parse as urllib_parse
from gettext import gettext as _
@@ -13,8 +13,7 @@ from wcmatch import glob
from . import extract, resources
from ._version import __version__
from .ads import calculate_content_hash, get_description_affixes
from .model.ad_model import MAX_DESCRIPTION_LENGTH, Ad
from .model.ad_model import MAX_DESCRIPTION_LENGTH, Ad, AdPartial
from .model.config_model import Config
from .utils import dicts, error_handlers, loggers, misc
from .utils.exceptions import CaptchaEncountered
@@ -80,6 +79,16 @@ class KleinanzeigenBot(WebScrapingMixin):
LOG.info("############################################")
LOG.info("DONE: No configuration errors found.")
LOG.info("############################################")
case "update-content-hash":
self.configure_file_logging()
self.load_config()
self.ads_selector = "all"
if ads := self.load_ads(exclude_ads_with_id = False):
self.update_content_hashes(ads)
else:
LOG.info("############################################")
LOG.info("DONE: No active ads found.")
LOG.info("############################################")
case "publish":
self.configure_file_logging()
self.load_config()
@@ -143,6 +152,9 @@ class KleinanzeigenBot(WebScrapingMixin):
verify - Überprüft die Konfigurationsdateien
delete - Löscht Anzeigen
download - Lädt eine oder mehrere Anzeigen herunter
update-content-hash - Berechnet den content_hash aller Anzeigen anhand der aktuellen ad_defaults neu;
nach Änderungen an den config.yaml/ad_defaults verhindert es, dass alle Anzeigen als
"geändert" gelten und neu veröffentlicht werden.
--
help - Zeigt diese Hilfe an (Standardbefehl)
version - Zeigt die Version der Anwendung an
@@ -178,6 +190,8 @@ class KleinanzeigenBot(WebScrapingMixin):
verify - verifies the configuration files
delete - deletes ads
download - downloads one or multiple ads
update-content-hash recalculates each ads content_hash based on the current ad_defaults;
use this after changing config.yaml/ad_defaults to avoid every ad being marked "changed" and republished
--
help - displays this help (default command)
version - displays the application version
@@ -269,9 +283,10 @@ class KleinanzeigenBot(WebScrapingMixin):
def __check_ad_republication(self, ad_cfg:Ad, ad_file_relative:str) -> bool:
"""
Check if an ad needs to be republished based on republication interval.
Returns True if the ad should be republished based on the interval.
Note: This method does not check for content changes. Use __check_ad_changed for that.
Note: This method no longer checks for content changes. Use __check_ad_changed for that.
Returns:
True if the ad should be republished based on the interval.
"""
if ad_cfg.updated_on:
last_updated_on = ad_cfg.updated_on
@@ -299,14 +314,16 @@ class KleinanzeigenBot(WebScrapingMixin):
def __check_ad_changed(self, ad_cfg:Ad, ad_cfg_orig:dict[str, Any], ad_file_relative:str) -> bool:
"""
Check if an ad has been changed since last publication.
Returns True if the ad has been changed.
Returns:
True if the ad has been changed.
"""
if not ad_cfg.id:
# New ads are not considered "changed"
return False
# Calculate hash on original config to match what was stored
current_hash = calculate_content_hash(ad_cfg_orig)
current_hash = AdPartial.model_validate(ad_cfg_orig).update_content_hash().content_hash
stored_hash = ad_cfg_orig.get("content_hash")
LOG.debug("Hash comparison for [%s]:", ad_file_relative)
@@ -321,7 +338,20 @@ class KleinanzeigenBot(WebScrapingMixin):
return False
def load_ads(self, *, ignore_inactive:bool = True, check_id:bool = True) -> list[tuple[str, Ad, dict[str, Any]]]:
def load_ads(self, *, ignore_inactive:bool = True, exclude_ads_with_id:bool = True) -> list[tuple[str, Ad, dict[str, Any]]]:
"""
Load and validate all ad config files, optionally filtering out inactive or alreadypublished ads.
Args:
ignore_inactive (bool):
Skip ads with `active=False`.
exclude_ads_with_id (bool):
Skip ads whose raw data already contains an `id`, i.e. was published before.
Returns:
list[tuple[str, Ad, dict[str, Any]]]:
Tuples of (file_path, validated Ad model, original raw data).
"""
LOG.info("Searching for ad config files...")
ad_files:dict[str, str] = {}
@@ -366,9 +396,9 @@ class KleinanzeigenBot(WebScrapingMixin):
should_include = True
# Check for 'new' selector
if "new" in selectors and (not ad_cfg.id or not check_id):
if "new" in selectors and (not ad_cfg.id or not exclude_ads_with_id):
should_include = True
elif "new" in selectors and ad_cfg.id and check_id:
elif "new" in selectors and ad_cfg.id and exclude_ads_with_id:
LOG.info(" -> SKIPPED: ad [%s] is not new. already has an id assigned.", ad_file_relative)
# Check for 'due' selector
@@ -427,13 +457,7 @@ class KleinanzeigenBot(WebScrapingMixin):
return ads
def load_ad(self, ad_cfg_orig:dict[str, Any]) -> Ad:
ad_cfg_merged = dicts.apply_defaults(
target = copy.deepcopy(ad_cfg_orig),
defaults = self.config.ad_defaults.model_dump(),
ignore = lambda k, _: k == "description",
override = lambda _, v: v == "" # noqa: PLC1901 can be simplified to `not v` as an empty string is falsey
)
return Ad.model_validate(ad_cfg_merged)
return AdPartial.model_validate(ad_cfg_orig).to_ad(self.config.ad_defaults)
def load_config(self) -> None:
# write default config.yaml if config file does not exist
@@ -805,7 +829,7 @@ class KleinanzeigenBot(WebScrapingMixin):
# Update content hash after successful publication
# Calculate hash on original config to ensure consistent comparison on restart
ad_cfg_orig["content_hash"] = calculate_content_hash(ad_cfg_orig)
ad_cfg_orig["content_hash"] = AdPartial.model_validate(ad_cfg_orig).update_content_hash().content_hash
ad_cfg_orig["updated_on"] = misc.now().isoformat(timespec = "seconds")
if not ad_cfg.created_on and not ad_cfg.id:
ad_cfg_orig["created_on"] = ad_cfg_orig["updated_on"]
@@ -1052,7 +1076,7 @@ class KleinanzeigenBot(WebScrapingMixin):
elif self.ads_selector == "new": # download only unsaved ads
# check which ads already saved
saved_ad_ids = []
ads = self.load_ads(ignore_inactive = False, check_id = False) # do not skip because of existing IDs
ads = self.load_ads(ignore_inactive = False, exclude_ads_with_id = False) # do not skip because of existing IDs
for ad in ads:
ad_id = int(ad[2]["id"])
saved_ad_ids.append(ad_id)
@@ -1111,7 +1135,7 @@ class KleinanzeigenBot(WebScrapingMixin):
# 1. Direct ad-level prefix
ad_cfg.description_prefix if ad_cfg.description_prefix is not None
# 2. Global prefix from config
else get_description_affixes(self.config, prefix = True)
else self.config.ad_defaults.description_prefix
or "" # Default to empty string if all sources are None
)
@@ -1120,7 +1144,7 @@ class KleinanzeigenBot(WebScrapingMixin):
# 1. Direct ad-level suffix
ad_cfg.description_suffix if ad_cfg.description_suffix is not None
# 2. Global suffix from config
else get_description_affixes(self.config, prefix = False)
else self.config.ad_defaults.description_suffix
or "" # Default to empty string if all sources are None
)
@@ -1137,6 +1161,21 @@ class KleinanzeigenBot(WebScrapingMixin):
return final_description
def update_content_hashes(self, ads:list[tuple[str, Ad, dict[str, Any]]]) -> None:
count = 0
for (ad_file, ad_cfg, ad_cfg_orig) in ads:
LOG.info("Processing %s/%s: '%s' from [%s]...", count + 1, len(ads), ad_cfg.title, ad_file)
ad_cfg.update_content_hash()
if ad_cfg.content_hash != ad_cfg_orig["content_hash"]:
count += 1
ad_cfg_orig["content_hash"] = ad_cfg.content_hash
dicts.save_dict(ad_file, ad_cfg_orig)
LOG.info("############################################")
LOG.info("DONE: Updated [content_hash] in %s", pluralize("ad", count))
LOG.info("############################################")
#############################
# main entry point
#############################

View File

@@ -1,84 +0,0 @@
# SPDX-FileCopyrightText: © Jens Bergman and contributors
# SPDX-License-Identifier: AGPL-3.0-or-later
# SPDX-ArtifactOfProjectHomePage: https://github.com/Second-Hand-Friends/kleinanzeigen-bot/
import hashlib, json, os # isort: skip
from typing import Any
from .model.config_model import Config
from .utils.misc import get_attr
def calculate_content_hash(ad_cfg:dict[str, Any]) -> str:
"""Calculate a hash for user-modifiable fields of the ad."""
# Relevant fields for the hash
content = {
"active": bool(get_attr(ad_cfg, "active", default = True)), # Explicitly convert to bool
"type": str(get_attr(ad_cfg, "type", "")), # Explicitly convert to string
"title": str(get_attr(ad_cfg, "title", "")),
"description": str(get_attr(ad_cfg, "description", "")),
"category": str(get_attr(ad_cfg, "category", "")),
"price": str(get_attr(ad_cfg, "price", "")), # Price always as string
"price_type": str(get_attr(ad_cfg, "price_type", "")),
"special_attributes": dict(get_attr(ad_cfg, "special_attributes", {})), # Handle None case
"shipping_type": str(get_attr(ad_cfg, "shipping_type", "")),
"shipping_costs": str(get_attr(ad_cfg, "shipping_costs", "")),
"shipping_options": sorted([str(x) for x in get_attr(ad_cfg, "shipping_options", [])]), # Handle None case
"sell_directly": bool(get_attr(ad_cfg, "sell_directly", default = False)), # Explicitly convert to bool
"images": sorted([os.path.basename(str(img)) if img is not None else "" for img in get_attr(ad_cfg, "images", [])]), # Handle None values in images
"contact": {
"name": str(get_attr(ad_cfg, "contact.name", "")),
"street": str(get_attr(ad_cfg, "contact.street", "")), # Changed from "None" to empty string for consistency
"zipcode": str(get_attr(ad_cfg, "contact.zipcode", "")),
"phone": str(get_attr(ad_cfg, "contact.phone", ""))
}
}
# Create sorted JSON string for consistent hashes
content_str = json.dumps(content, sort_keys = True)
return hashlib.sha256(content_str.encode()).hexdigest()
def get_description_affixes(config:Config, *, prefix:bool = True) -> str:
"""Get prefix or suffix for description with proper precedence.
This function handles both the new flattened format and legacy nested format:
New format (flattened):
ad_defaults:
description_prefix: "Global Prefix"
description_suffix: "Global Suffix"
Legacy format (nested):
ad_defaults:
description:
prefix: "Legacy Prefix"
suffix: "Legacy Suffix"
Args:
config: Configuration dictionary containing ad_defaults
prefix: If True, get prefix, otherwise get suffix
Returns:
The appropriate affix string, empty string if none found
Example:
>>> config = {"ad_defaults": {"description_prefix": "Hello", "description": {"prefix": "Hi"}}}
>>> get_description_affixes(Config.model_validate(config), prefix=True)
'Hello'
"""
affix_type = "prefix" if prefix else "suffix"
# First try new flattened format (description_prefix/description_suffix)
flattened_key = f"description_{affix_type}"
flattened_value = getattr(config.ad_defaults, flattened_key)
if isinstance(flattened_value, str):
return flattened_value
# Then try legacy nested format (description.prefix/description.suffix)
if config.ad_defaults.description:
nested_value = getattr(config.ad_defaults.description, affix_type)
if isinstance(nested_value, str):
return nested_value
return ""

View File

@@ -8,7 +8,6 @@ from typing import Any, Final
from kleinanzeigen_bot.model.ad_model import ContactPartial
from .ads import calculate_content_hash, get_description_affixes
from .model.ad_model import AdPartial
from .model.config_model import Config
from .utils import dicts, i18n, loggers, misc, reflect
@@ -294,8 +293,8 @@ class AdExtractor(WebScrapingMixin):
raw_description = (await self.web_text(By.ID, "viewad-description-text")).strip()
# Get prefix and suffix from config
prefix = get_description_affixes(self.config, prefix = True)
suffix = get_description_affixes(self.config, prefix = False)
prefix = self.config.ad_defaults.description_prefix
suffix = self.config.ad_defaults.description_suffix
# Remove prefix and suffix if present
description_text = raw_description
@@ -334,10 +333,12 @@ class AdExtractor(WebScrapingMixin):
info["created_on"] = creation_date
info["updated_on"] = None # will be set later on
# Calculate the initial hash for the downloaded ad
info["content_hash"] = calculate_content_hash(info)
ad_cfg = AdPartial.model_validate(info)
return AdPartial.model_validate(info)
# calculate the initial hash for the downloaded ad
ad_cfg.content_hash = ad_cfg.to_ad(self.config.ad_defaults).update_content_hash().content_hash
return ad_cfg
async def _extract_category_from_ad_page(self) -> str:
"""

View File

@@ -3,20 +3,28 @@
# SPDX-ArtifactOfProjectHomePage: https://github.com/Second-Hand-Friends/kleinanzeigen-bot/
from __future__ import annotations
import hashlib, json # isort: skip
from datetime import datetime # noqa: TC003 Move import into a type-checking block
from typing import Any, Dict, Final, List, Literal
from typing import Any, Dict, Final, List, Literal, Mapping, Sequence
from pydantic import Field, model_validator, validator
from typing_extensions import Self
from kleinanzeigen_bot.model.config_model import AdDefaults # noqa: TC001 Move application import into a type-checking block
from kleinanzeigen_bot.utils import dicts
from kleinanzeigen_bot.utils.misc import parse_datetime, parse_decimal
from kleinanzeigen_bot.utils.pydantics import ContextualModel
MAX_DESCRIPTION_LENGTH:Final[int] = 4000
def _iso_datetime_field() -> Any:
def _OPTIONAL() -> Any:
return Field(default = None)
def _ISO_DATETIME(default:datetime | None = None) -> Any:
return Field(
default = None,
default = default,
description = "ISO-8601 timestamp with optional timezone (e.g. 2024-12-25T00:00:00 or 2024-12-25T00:00:00Z)",
json_schema_extra = {
"anyOf": [
@@ -36,37 +44,37 @@ def _iso_datetime_field() -> Any:
class ContactPartial(ContextualModel):
name:str | None = None
street:str | None = None
zipcode:int | str | None = None
location:str | None = None
name:str | None = _OPTIONAL()
street:str | None = _OPTIONAL()
zipcode:int | str | None = _OPTIONAL()
location:str | None = _OPTIONAL()
phone:str | None = None
phone:str | None = _OPTIONAL()
class AdPartial(ContextualModel):
active:bool = True
type:Literal["OFFER", "WANTED"] = "OFFER"
active:bool | None = _OPTIONAL()
type:Literal["OFFER", "WANTED"] | None = _OPTIONAL()
title:str = Field(..., min_length = 10)
description:str
description_prefix:str | None = None
description_suffix:str | None = None
description_prefix:str | None = _OPTIONAL()
description_suffix:str | None = _OPTIONAL()
category:str
special_attributes:Dict[str, str] | None = Field(default = None)
price:int | None = None
price_type:Literal["FIXED", "NEGOTIABLE", "GIVE_AWAY", "NOT_APPLICABLE"] = "NEGOTIABLE"
shipping_type:Literal["PICKUP", "SHIPPING", "NOT_APPLICABLE"] = "SHIPPING"
shipping_costs:float | None = None
shipping_options:List[str] | None = Field(default = None)
sell_directly:bool | None = False
images:List[str] | None = Field(default = None)
contact:ContactPartial | None = None
republication_interval:int = 7
special_attributes:Dict[str, str] | None = _OPTIONAL()
price:int | None = _OPTIONAL()
price_type:Literal["FIXED", "NEGOTIABLE", "GIVE_AWAY", "NOT_APPLICABLE"] | None = _OPTIONAL()
shipping_type:Literal["PICKUP", "SHIPPING", "NOT_APPLICABLE"] | None = _OPTIONAL()
shipping_costs:float | None = _OPTIONAL()
shipping_options:List[str] | None = _OPTIONAL()
sell_directly:bool | None = _OPTIONAL()
images:List[str] | None = _OPTIONAL()
contact:ContactPartial | None = _OPTIONAL()
republication_interval:int | None = _OPTIONAL()
id:int | None = None
created_on:datetime | None = _iso_datetime_field()
updated_on:datetime | None = _iso_datetime_field()
content_hash:str | None = None
id:int | None = _OPTIONAL()
created_on:datetime | None = _ISO_DATETIME()
updated_on:datetime | None = _ISO_DATETIME()
content_hash:str | None = _OPTIONAL()
@validator("created_on", "updated_on", pre = True)
@classmethod
@@ -105,11 +113,71 @@ class AdPartial(ContextualModel):
raise ValueError("shipping_options entries must be non-empty")
return v
def update_content_hash(self) -> Self:
"""Calculate and updates the content_hash value for user-modifiable fields of the ad."""
# 1) Dump to a plain dict, excluding the metadata fields:
raw = self.model_dump(
exclude = {"id", "created_on", "updated_on", "content_hash"},
exclude_none = True,
exclude_unset = True,
)
# 2) Recursively prune any empty containers:
def prune(obj:Any) -> Any:
if isinstance(obj, Mapping):
return {
k: prune(v)
for k, v in obj.items()
# drop keys whose values are empty list/dict/set
if not (isinstance(v, (Mapping, Sequence, set)) and not isinstance(v, (str, bytes)) and len(v) == 0)
}
if isinstance(obj, Sequence) and not isinstance(obj, (str, bytes)):
return [
prune(v)
for v in obj
if not (isinstance(v, (Mapping, Sequence, set)) and not isinstance(v, (str, bytes)) and len(v) == 0)
]
return obj
pruned = prune(raw)
# 3) Produce a canonical JSON string and hash it:
json_string = json.dumps(pruned, sort_keys = True)
self.content_hash = hashlib.sha256(json_string.encode()).hexdigest()
return self
def to_ad(self, ad_defaults:AdDefaults) -> Ad:
"""
Returns a complete, validated Ad by merging this partial with values from ad_defaults.
Any field that is `None` or `""` is filled from `ad_defaults`.
Raises `ValidationError` when, after merging with `ad_defaults`, not all fields required by `Ad` are populated.
"""
ad_cfg = self.model_dump()
dicts.apply_defaults(
target = ad_cfg,
defaults = ad_defaults.model_dump(),
ignore = lambda k, _: k == "description", # ignore legacy global description config
override = lambda _, v: v in {None, ""} # noqa: PLC1901 can be simplified
)
return Ad.model_validate(ad_cfg)
# pyright: reportGeneralTypeIssues=false, reportIncompatibleVariableOverride=false
class Contact(ContactPartial):
name:str # pyright: ignore[reportGeneralTypeIssues, reportIncompatibleVariableOverride]
zipcode:int | str # pyright: ignore[reportGeneralTypeIssues, reportIncompatibleVariableOverride]
name:str
zipcode:int | str
# pyright: reportGeneralTypeIssues=false, reportIncompatibleVariableOverride=false
class Ad(AdPartial):
contact:Contact # pyright: ignore[reportGeneralTypeIssues, reportIncompatibleVariableOverride]
active:bool
type:Literal["OFFER", "WANTED"]
description:str
price_type:Literal["FIXED", "NEGOTIABLE", "GIVE_AWAY", "NOT_APPLICABLE"]
shipping_type:Literal["PICKUP", "SHIPPING", "NOT_APPLICABLE"]
sell_directly:bool
contact:Contact
republication_interval:int

View File

@@ -4,12 +4,13 @@
from __future__ import annotations
import copy
from typing import Any, Dict, List, Literal
from typing import Any, List, Literal
from pydantic import Field, model_validator, validator
from typing_extensions import deprecated
from kleinanzeigen_bot.utils import dicts
from kleinanzeigen_bot.utils.misc import get_attr
from kleinanzeigen_bot.utils.pydantics import ContextualModel
@@ -40,16 +41,17 @@ class AdDefaults(ContextualModel):
@model_validator(mode = "before")
@classmethod
def unify_description(cls, values:Dict[str, Any]) -> Dict[str, Any]:
def migrate_legacy_description(cls, values:dict[str, Any]) -> dict[str, Any]:
# Ensure flat prefix/suffix take precedence over deprecated nested "description"
desc = values.get("description")
flat_prefix = values.get("description_prefix")
flat_suffix = values.get("description_suffix")
description_prefix = values.get("description_prefix")
description_suffix = values.get("description_suffix")
legacy_prefix = get_attr(values, "description.prefix")
legacy_suffix = get_attr(values, "description.suffix")
if not flat_prefix and isinstance(desc, dict) and desc.get("prefix") is not None:
values["description_prefix"] = desc.get("prefix", "")
if not flat_suffix and isinstance(desc, dict) and desc.get("suffix") is not None:
values["description_suffix"] = desc.get("suffix", "")
if not description_prefix and legacy_prefix is not None:
values["description_prefix"] = legacy_prefix
if not description_suffix and legacy_suffix is not None:
values["description_suffix"] = legacy_suffix
return values
@@ -115,7 +117,7 @@ if relative paths are specified, then they are relative to this configuration fi
description = "Default values for ads, can be overwritten in each ad configuration file"
)
categories:Dict[str, str] = Field(default_factory = dict, description = """
categories:dict[str, str] = Field(default_factory = dict, description = """
additional name to category ID mappings, see default list at
https://github.com/Second-Hand-Friends/kleinanzeigen-bot/blob/main/src/kleinanzeigen_bot/resources/categories.yaml

View File

@@ -133,6 +133,7 @@ kleinanzeigen_bot/__init__.py:
run:
"DONE: No configuration errors found.": "FERTIG: Keine Konfigurationsfehler gefunden."
"DONE: No active ads found.": "FERTIG: Keine aktiven Anzeigen gefunden."
"You provided no ads selector. Defaulting to \"due\".": "Es wurden keine Anzeigen-Selektor angegeben. Es wird \"due\" verwendet."
"DONE: No new/outdated ads found.": "FERTIG: Keine neuen/veralteten Anzeigen gefunden."
"DONE: No ads to delete found.": "FERTIG: Keine zu löschnenden Anzeigen gefunden."
@@ -148,6 +149,11 @@ kleinanzeigen_bot/__init__.py:
__set_shipping_options:
"Unable to close shipping dialog!": "Versanddialog konnte nicht geschlossen werden!"
update_content_hashes:
"DONE: Updated [content_hash] in %s": "FERTIG: [content_hash] in %s aktualisiert."
"Processing %s/%s: '%s' from [%s]...": "Verarbeite %s/%s: '%s' von [%s]..."
"ad": "Anzeige"
#################################################
kleinanzeigen_bot/extract.py:
#################################################

View File

@@ -67,7 +67,7 @@ def configure_console_logging() -> None:
CRITICAL: colorama.Fore.MAGENTA,
}
def _relativize_paths_under_cwd(self, record: logging.LogRecord) -> None:
def _relativize_paths_under_cwd(self, record:logging.LogRecord) -> None:
"""
Mutate record.args in-place, converting any absolute-path strings
under the current working directory into relative paths.
@@ -78,7 +78,7 @@ def configure_console_logging() -> None:
cwd = os.getcwd()
def _rel_if_subpath(val: Any) -> Any:
def _rel_if_subpath(val:Any) -> Any:
if isinstance(val, str) and os.path.isabs(val):
# don't relativize log-file paths
if val.endswith(".log"):