mirror of
https://github.com/Second-Hand-Friends/kleinanzeigen-bot.git
synced 2026-03-12 02:31:45 +01:00
Co-authored-by: sebthom <sebthom@users.noreply.github.com>
This commit is contained in:
@@ -281,7 +281,6 @@ browser:
|
||||
login:
|
||||
username: ""
|
||||
password: ""
|
||||
|
||||
```
|
||||
|
||||
### <a name="ad-config"></a>2) Ad configuration
|
||||
@@ -342,9 +341,11 @@ contact:
|
||||
|
||||
republication_interval: # every X days the ad should be re-published
|
||||
|
||||
id: # set automatically
|
||||
created_on: # set automatically
|
||||
updated_on: # set automatically
|
||||
# The following fields are automatically managed by the bot:
|
||||
id: # the ID assigned by kleinanzeigen.de
|
||||
created_on: # ISO timestamp when the ad was first published
|
||||
updated_on: # ISO timestamp when the ad was last published
|
||||
content_hash: # hash of the ad content, used to detect changes
|
||||
```
|
||||
|
||||
### <a name="existing-browser"></a>3) Using an existing browser window
|
||||
|
||||
@@ -19,7 +19,7 @@ from wcmatch import glob
|
||||
|
||||
from . import utils, resources, extract
|
||||
from .i18n import Locale, get_current_locale, set_current_locale, get_translating_logger, pluralize
|
||||
from .utils import abspath, ainput, apply_defaults, ensure, is_frozen, safe_get, parse_datetime
|
||||
from .utils import abspath, ainput, apply_defaults, ensure, is_frozen, safe_get, parse_datetime, calculate_content_hash
|
||||
from .web_scraping_mixin import By, Element, Page, Is, WebScrapingMixin
|
||||
from ._version import __version__
|
||||
|
||||
@@ -262,6 +262,49 @@ class KleinanzeigenBot(WebScrapingMixin):
|
||||
LOG.info("App version: %s", self.get_version())
|
||||
LOG.info("Python version: %s", sys.version)
|
||||
|
||||
def __check_ad_republication(self, ad_cfg: dict[str, Any], ad_cfg_orig: dict[str, Any], ad_file_relative: str) -> bool:
    """
    Decide whether an ad has to be (re-)published.

    The decision is based on the ad's last publication timestamp, on a
    comparison of the freshly calculated content hash with the hash stored in
    the original ad configuration, and on the configured republication interval.

    :param ad_cfg: the effective ad configuration; the keys "updated_on",
        "created_on", "id" and "republication_interval" are read
    :param ad_cfg_orig: the original ad configuration; its "content_hash" entry
        is read and is overwritten with the current hash when a change is detected
    :param ad_file_relative: ad file path, only used in log messages
    :return: True if the ad should be republished, False if it can be skipped
    """
    # Prefer the timestamp of the last publication, fall back to the creation timestamp.
    # Without any timestamp the ad is always considered due.
    timestamp = ad_cfg["updated_on"] or ad_cfg["created_on"]
    if not timestamp:
        return True

    last_published_on = parse_datetime(timestamp)
    if not last_published_on:
        return True

    # Ads without an ID are republished without any hash or interval check.
    if not ad_cfg["id"]:
        return True

    stored_hash = ad_cfg_orig.get("content_hash")
    current_hash = calculate_content_hash(ad_cfg)

    LOG.debug("Hash comparison for [%s]:", ad_file_relative)
    LOG.debug(" Stored hash: %s", stored_hash)
    LOG.debug(" Current hash: %s", current_hash)

    # A missing stored hash is treated the same way as a changed ad.
    if not stored_hash or current_hash != stored_hash:
        LOG.info("Changes detected in ad [%s], will republish", ad_file_relative)
        # Remember the new hash in the original configuration
        ad_cfg_orig["content_hash"] = current_hash
        return True

    # Content is unchanged - only the republication interval decides now.
    ad_age = datetime.utcnow() - last_published_on
    if ad_age.days <= ad_cfg["republication_interval"]:
        LOG.info(
            " -> SKIPPED: ad [%s] was last published %d days ago. republication is only required every %s days",
            ad_file_relative,
            ad_age.days,
            ad_cfg["republication_interval"]
        )
        return False

    return True
|
||||
|
||||
def load_ads(self, *, ignore_inactive:bool = True, check_id:bool = True) -> list[tuple[str, dict[str, Any], dict[str, Any]]]:
|
||||
LOG.info("Searching for ad config files...")
|
||||
|
||||
@@ -275,8 +318,10 @@ class KleinanzeigenBot(WebScrapingMixin):
|
||||
if not ad_files:
|
||||
return []
|
||||
|
||||
descr_prefix = self.config["ad_defaults"]["description"]["prefix"] or ""
|
||||
descr_suffix = self.config["ad_defaults"]["description"]["suffix"] or ""
|
||||
description_config = {
|
||||
"prefix": self.config["ad_defaults"]["description"]["prefix"] or "",
|
||||
"suffix": self.config["ad_defaults"]["description"]["suffix"] or ""
|
||||
}
|
||||
|
||||
ids = []
|
||||
use_specific_ads = False
|
||||
@@ -308,24 +353,10 @@ class KleinanzeigenBot(WebScrapingMixin):
|
||||
continue
|
||||
|
||||
if self.ads_selector == "due":
|
||||
if ad_cfg["updated_on"]:
|
||||
last_updated_on = parse_datetime(ad_cfg["updated_on"])
|
||||
elif ad_cfg["created_on"]:
|
||||
last_updated_on = parse_datetime(ad_cfg["created_on"])
|
||||
else:
|
||||
last_updated_on = None
|
||||
if not self.__check_ad_republication(ad_cfg, ad_cfg_orig, ad_file_relative):
|
||||
continue
|
||||
|
||||
if last_updated_on:
|
||||
ad_age = datetime.utcnow() - last_updated_on
|
||||
if ad_age.days <= ad_cfg["republication_interval"]:
|
||||
LOG.info(" -> SKIPPED: ad [%s] was last published %d days ago. republication is only required every %s days",
|
||||
ad_file_relative,
|
||||
ad_age.days,
|
||||
ad_cfg["republication_interval"]
|
||||
)
|
||||
continue
|
||||
|
||||
ad_cfg["description"] = descr_prefix + (ad_cfg["description"] or "") + descr_suffix
|
||||
ad_cfg["description"] = description_config["prefix"] + (ad_cfg["description"] or "") + description_config["suffix"]
|
||||
ad_cfg["description"] = ad_cfg["description"].replace("@", "(at)")
|
||||
ensure(len(ad_cfg["description"]) <= 4000, f"Length of ad description including prefix and suffix exceeds 4000 chars. @ [{ad_file}]")
|
||||
|
||||
@@ -749,15 +780,17 @@ class KleinanzeigenBot(WebScrapingMixin):
|
||||
|
||||
await self.web_await(lambda: "p-anzeige-aufgeben-bestaetigung.html?adId=" in self.page.url, timeout = 20)
|
||||
|
||||
ad_cfg_orig["updated_on"] = datetime.utcnow().isoformat()
|
||||
if not ad_cfg["created_on"] and not ad_cfg["id"]:
|
||||
ad_cfg_orig["created_on"] = ad_cfg_orig["updated_on"]
|
||||
|
||||
# extract the ad id from the URL's query parameter
|
||||
current_url_query_params = urllib_parse.parse_qs(urllib_parse.urlparse(self.page.url).query)
|
||||
ad_id = int(current_url_query_params.get("adId", [])[0])
|
||||
ad_cfg_orig["id"] = ad_id
|
||||
|
||||
# Update content hash after successful publication
|
||||
ad_cfg_orig["content_hash"] = calculate_content_hash(ad_cfg)
|
||||
ad_cfg_orig["updated_on"] = datetime.utcnow().isoformat()
|
||||
if not ad_cfg["created_on"] and not ad_cfg["id"]:
|
||||
ad_cfg_orig["created_on"] = ad_cfg_orig["updated_on"]
|
||||
|
||||
LOG.info(" -> SUCCESS: ad published with ID %s", ad_id)
|
||||
|
||||
utils.save_dict(ad_file, ad_cfg_orig)
|
||||
|
||||
@@ -11,7 +11,7 @@ from typing import Any, Final
|
||||
import json
|
||||
|
||||
from .i18n import get_translating_logger, pluralize
|
||||
from .utils import is_integer, parse_decimal, save_dict
|
||||
from .utils import is_integer, parse_decimal, save_dict, calculate_content_hash
|
||||
from .web_scraping_mixin import Browser, By, Element, Is, WebScrapingMixin
|
||||
|
||||
__all__ = [
|
||||
@@ -269,6 +269,9 @@ class AdExtractor(WebScrapingMixin):
|
||||
info['created_on'] = creation_date
|
||||
info['updated_on'] = None # will be set later on
|
||||
|
||||
# Calculate the initial hash for the downloaded ad
|
||||
info['content_hash'] = calculate_content_hash(info)
|
||||
|
||||
return info
|
||||
|
||||
async def _extract_category_from_ad_page(self) -> str:
|
||||
|
||||
@@ -113,6 +113,12 @@ kleinanzeigen_bot/__init__.py:
|
||||
'Downloaded ad with id %d': 'Anzeige mit der ID %d heruntergeladen'
|
||||
'The page with the id %d does not exist!': 'Die Seite mit der ID %d existiert nicht!'
|
||||
|
||||
__check_ad_republication:
|
||||
"Hash comparison for [%s]:": "Hash-Vergleich für [%s]:"
|
||||
" Stored hash: %s": " Gespeicherter Hash: %s"
|
||||
" Current hash: %s": " Aktueller Hash: %s"
|
||||
"Changes detected in ad [%s], will republish": "Änderungen in Anzeige [%s] erkannt, wird neu veröffentlicht"
|
||||
|
||||
|
||||
#################################################
|
||||
kleinanzeigen_bot/extract.py:
|
||||
|
||||
@@ -3,7 +3,7 @@ SPDX-FileCopyrightText: © Sebastian Thomschke and contributors
|
||||
SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
SPDX-ArtifactOfProjectHomePage: https://github.com/Second-Hand-Friends/kleinanzeigen-bot/
|
||||
"""
|
||||
import asyncio, copy, decimal, inspect, json, logging, os, re, socket, sys, traceback, time
|
||||
import asyncio, copy, decimal, inspect, json, logging, os, re, socket, sys, traceback, time, hashlib
|
||||
from importlib.resources import read_text as get_resource_as_string
|
||||
from collections.abc import Callable
|
||||
from datetime import datetime
|
||||
@@ -333,3 +333,34 @@ def parse_datetime(date:datetime | str | None) -> datetime | None:
|
||||
if isinstance(date, datetime):
|
||||
return date
|
||||
return datetime.fromisoformat(date)
|
||||
|
||||
|
||||
def calculate_content_hash(ad_cfg: dict[str, Any]) -> str:
    """
    Calculate a hash for user-modifiable fields of the ad.

    Only the fields listed below take part in the hash. Values are normalized
    (converted to str/bool, lists sorted, image paths reduced to their file
    name) before they are serialized as JSON with sorted keys.

    Keys that are present but set to None for "images", "special_attributes",
    "shipping_options" and "contact" are handled like missing keys instead of
    raising an error.

    :param ad_cfg: the ad configuration to hash
    :return: hex encoded SHA-256 digest of the normalized content
    """
    # `dict.get(key, default)` returns None if the key exists with a None value
    # (e.g. an empty YAML entry), therefore fall back explicitly via `or`.
    contact = ad_cfg.get("contact") or {}
    special_attributes = ad_cfg.get("special_attributes") or {}
    shipping_options = ad_cfg.get("shipping_options") or []
    images = ad_cfg.get("images") or []

    # Relevant fields for the hash
    content = {
        "active": bool(ad_cfg.get("active", True)),
        "type": str(ad_cfg.get("type", "")),
        "title": str(ad_cfg.get("title", "")),
        "description": str(ad_cfg.get("description", "")),
        "category": str(ad_cfg.get("category", "")),
        "price": str(ad_cfg.get("price", "")),  # Price always as string
        "price_type": str(ad_cfg.get("price_type", "")),
        "special_attributes": dict(special_attributes),  # Copy the dict
        "shipping_type": str(ad_cfg.get("shipping_type", "")),
        "shipping_costs": str(ad_cfg.get("shipping_costs", "")),
        "shipping_options": sorted([str(x) for x in shipping_options]),  # Order must not matter
        "sell_directly": bool(ad_cfg.get("sell_directly", False)),
        # Only the file names are hashed, not the directories the images are stored in
        "images": sorted([os.path.basename(img) if isinstance(img, str) else str(img) for img in images]),
        "contact": {
            "name": str(contact.get("name", "")),
            "street": str(contact.get("street", "None")),  # Explicitly "None" as string for None values
            "zipcode": str(contact.get("zipcode", "")),
            "phone": str(contact.get("phone", ""))
        }
    }

    # Create sorted JSON string for consistent hashes
    content_str = json.dumps(content, sort_keys=True)
    return hashlib.sha256(content_str.encode()).hexdigest()
|
||||
|
||||
Reference in New Issue
Block a user