mirror of
https://github.com/Second-Hand-Friends/kleinanzeigen-bot.git
synced 2026-03-12 10:31:50 +01:00
Co-authored-by: sebthom <sebthom@users.noreply.github.com>
This commit is contained in:
@@ -281,7 +281,6 @@ browser:
|
|||||||
login:
|
login:
|
||||||
username: ""
|
username: ""
|
||||||
password: ""
|
password: ""
|
||||||
|
|
||||||
```
|
```
|
||||||
|
|
||||||
### <a name="ad-config"></a>2) Ad configuration
|
### <a name="ad-config"></a>2) Ad configuration
|
||||||
@@ -342,9 +341,11 @@ contact:
|
|||||||
|
|
||||||
republication_interval: # every X days the ad should be re-published
|
republication_interval: # every X days the ad should be re-published
|
||||||
|
|
||||||
id: # set automatically
|
# The following fields are automatically managed by the bot:
|
||||||
created_on: # set automatically
|
id: # the ID assigned by kleinanzeigen.de
|
||||||
updated_on: # set automatically
|
created_on: # ISO timestamp when the ad was first published
|
||||||
|
updated_on: # ISO timestamp when the ad was last published
|
||||||
|
content_hash: # hash of the ad content, used to detect changes
|
||||||
```
|
```
|
||||||
|
|
||||||
### <a name="existing-browser"></a>3) Using an existing browser window
|
### <a name="existing-browser"></a>3) Using an existing browser window
|
||||||
|
|||||||
@@ -19,7 +19,7 @@ from wcmatch import glob
|
|||||||
|
|
||||||
from . import utils, resources, extract
|
from . import utils, resources, extract
|
||||||
from .i18n import Locale, get_current_locale, set_current_locale, get_translating_logger, pluralize
|
from .i18n import Locale, get_current_locale, set_current_locale, get_translating_logger, pluralize
|
||||||
from .utils import abspath, ainput, apply_defaults, ensure, is_frozen, safe_get, parse_datetime
|
from .utils import abspath, ainput, apply_defaults, ensure, is_frozen, safe_get, parse_datetime, calculate_content_hash
|
||||||
from .web_scraping_mixin import By, Element, Page, Is, WebScrapingMixin
|
from .web_scraping_mixin import By, Element, Page, Is, WebScrapingMixin
|
||||||
from ._version import __version__
|
from ._version import __version__
|
||||||
|
|
||||||
@@ -262,6 +262,49 @@ class KleinanzeigenBot(WebScrapingMixin):
|
|||||||
LOG.info("App version: %s", self.get_version())
|
LOG.info("App version: %s", self.get_version())
|
||||||
LOG.info("Python version: %s", sys.version)
|
LOG.info("Python version: %s", sys.version)
|
||||||
|
|
||||||
|
def __check_ad_republication(self, ad_cfg: dict[str, Any], ad_cfg_orig: dict[str, Any], ad_file_relative: str) -> bool:
    """
    Decide whether an ad has to be (re-)published.

    The decision combines the ad's last publication timestamp, a comparison of the
    current content hash against the hash stored in the ad file, and the configured
    republication interval.

    :param ad_cfg: the effective ad configuration; the keys "updated_on", "created_on",
        "id" and "republication_interval" are read from it
    :param ad_cfg_orig: the ad configuration as stored in the ad file; its "content_hash"
        entry is read and, when a change is detected, overwritten with the current hash
    :param ad_file_relative: path of the ad file, only used in log messages
    :return: False only if the content is unchanged and the ad is not older than the
        republication interval, True in every other case
    """
    # prefer the timestamp of the last update, fall back to the creation timestamp
    timestamp = ad_cfg["updated_on"] or ad_cfg["created_on"]
    if not timestamp:
        return True

    last_updated_on = parse_datetime(timestamp)
    if not last_updated_on:
        return True

    # without an ad ID no hash comparison is performed
    if not ad_cfg["id"]:
        return True

    current_hash = calculate_content_hash(ad_cfg)
    stored_hash = ad_cfg_orig.get("content_hash")

    LOG.debug("Hash comparison for [%s]:", ad_file_relative)
    LOG.debug(" Stored hash: %s", stored_hash)
    LOG.debug(" Current hash: %s", current_hash)

    # a missing stored hash is handled the same way as a changed one
    if not stored_hash or current_hash != stored_hash:
        LOG.info("Changes detected in ad [%s], will republish", ad_file_relative)
        # remember the new hash in the original configuration
        ad_cfg_orig["content_hash"] = current_hash
        return True

    # content is unchanged, so only the republication interval matters
    ad_age = datetime.utcnow() - last_updated_on
    republication_interval = ad_cfg["republication_interval"]
    if ad_age.days <= republication_interval:
        LOG.info(
            " -> SKIPPED: ad [%s] was last published %d days ago. republication is only required every %s days",
            ad_file_relative,
            ad_age.days,
            republication_interval
        )
        return False

    return True
|
||||||
|
|
||||||
def load_ads(self, *, ignore_inactive:bool = True, check_id:bool = True) -> list[tuple[str, dict[str, Any], dict[str, Any]]]:
|
def load_ads(self, *, ignore_inactive:bool = True, check_id:bool = True) -> list[tuple[str, dict[str, Any], dict[str, Any]]]:
|
||||||
LOG.info("Searching for ad config files...")
|
LOG.info("Searching for ad config files...")
|
||||||
|
|
||||||
@@ -275,8 +318,10 @@ class KleinanzeigenBot(WebScrapingMixin):
|
|||||||
if not ad_files:
|
if not ad_files:
|
||||||
return []
|
return []
|
||||||
|
|
||||||
descr_prefix = self.config["ad_defaults"]["description"]["prefix"] or ""
|
description_config = {
|
||||||
descr_suffix = self.config["ad_defaults"]["description"]["suffix"] or ""
|
"prefix": self.config["ad_defaults"]["description"]["prefix"] or "",
|
||||||
|
"suffix": self.config["ad_defaults"]["description"]["suffix"] or ""
|
||||||
|
}
|
||||||
|
|
||||||
ids = []
|
ids = []
|
||||||
use_specific_ads = False
|
use_specific_ads = False
|
||||||
@@ -308,24 +353,10 @@ class KleinanzeigenBot(WebScrapingMixin):
|
|||||||
continue
|
continue
|
||||||
|
|
||||||
if self.ads_selector == "due":
|
if self.ads_selector == "due":
|
||||||
if ad_cfg["updated_on"]:
|
if not self.__check_ad_republication(ad_cfg, ad_cfg_orig, ad_file_relative):
|
||||||
last_updated_on = parse_datetime(ad_cfg["updated_on"])
|
continue
|
||||||
elif ad_cfg["created_on"]:
|
|
||||||
last_updated_on = parse_datetime(ad_cfg["created_on"])
|
|
||||||
else:
|
|
||||||
last_updated_on = None
|
|
||||||
|
|
||||||
if last_updated_on:
|
ad_cfg["description"] = description_config["prefix"] + (ad_cfg["description"] or "") + description_config["suffix"]
|
||||||
ad_age = datetime.utcnow() - last_updated_on
|
|
||||||
if ad_age.days <= ad_cfg["republication_interval"]:
|
|
||||||
LOG.info(" -> SKIPPED: ad [%s] was last published %d days ago. republication is only required every %s days",
|
|
||||||
ad_file_relative,
|
|
||||||
ad_age.days,
|
|
||||||
ad_cfg["republication_interval"]
|
|
||||||
)
|
|
||||||
continue
|
|
||||||
|
|
||||||
ad_cfg["description"] = descr_prefix + (ad_cfg["description"] or "") + descr_suffix
|
|
||||||
ad_cfg["description"] = ad_cfg["description"].replace("@", "(at)")
|
ad_cfg["description"] = ad_cfg["description"].replace("@", "(at)")
|
||||||
ensure(len(ad_cfg["description"]) <= 4000, f"Length of ad description including prefix and suffix exceeds 4000 chars. @ [{ad_file}]")
|
ensure(len(ad_cfg["description"]) <= 4000, f"Length of ad description including prefix and suffix exceeds 4000 chars. @ [{ad_file}]")
|
||||||
|
|
||||||
@@ -749,15 +780,17 @@ class KleinanzeigenBot(WebScrapingMixin):
|
|||||||
|
|
||||||
await self.web_await(lambda: "p-anzeige-aufgeben-bestaetigung.html?adId=" in self.page.url, timeout = 20)
|
await self.web_await(lambda: "p-anzeige-aufgeben-bestaetigung.html?adId=" in self.page.url, timeout = 20)
|
||||||
|
|
||||||
ad_cfg_orig["updated_on"] = datetime.utcnow().isoformat()
|
|
||||||
if not ad_cfg["created_on"] and not ad_cfg["id"]:
|
|
||||||
ad_cfg_orig["created_on"] = ad_cfg_orig["updated_on"]
|
|
||||||
|
|
||||||
# extract the ad id from the URL's query parameter
|
# extract the ad id from the URL's query parameter
|
||||||
current_url_query_params = urllib_parse.parse_qs(urllib_parse.urlparse(self.page.url).query)
|
current_url_query_params = urllib_parse.parse_qs(urllib_parse.urlparse(self.page.url).query)
|
||||||
ad_id = int(current_url_query_params.get("adId", [])[0])
|
ad_id = int(current_url_query_params.get("adId", [])[0])
|
||||||
ad_cfg_orig["id"] = ad_id
|
ad_cfg_orig["id"] = ad_id
|
||||||
|
|
||||||
|
# Update content hash after successful publication
|
||||||
|
ad_cfg_orig["content_hash"] = calculate_content_hash(ad_cfg)
|
||||||
|
ad_cfg_orig["updated_on"] = datetime.utcnow().isoformat()
|
||||||
|
if not ad_cfg["created_on"] and not ad_cfg["id"]:
|
||||||
|
ad_cfg_orig["created_on"] = ad_cfg_orig["updated_on"]
|
||||||
|
|
||||||
LOG.info(" -> SUCCESS: ad published with ID %s", ad_id)
|
LOG.info(" -> SUCCESS: ad published with ID %s", ad_id)
|
||||||
|
|
||||||
utils.save_dict(ad_file, ad_cfg_orig)
|
utils.save_dict(ad_file, ad_cfg_orig)
|
||||||
|
|||||||
@@ -11,7 +11,7 @@ from typing import Any, Final
|
|||||||
import json
|
import json
|
||||||
|
|
||||||
from .i18n import get_translating_logger, pluralize
|
from .i18n import get_translating_logger, pluralize
|
||||||
from .utils import is_integer, parse_decimal, save_dict
|
from .utils import is_integer, parse_decimal, save_dict, calculate_content_hash
|
||||||
from .web_scraping_mixin import Browser, By, Element, Is, WebScrapingMixin
|
from .web_scraping_mixin import Browser, By, Element, Is, WebScrapingMixin
|
||||||
|
|
||||||
__all__ = [
|
__all__ = [
|
||||||
@@ -269,6 +269,9 @@ class AdExtractor(WebScrapingMixin):
|
|||||||
info['created_on'] = creation_date
|
info['created_on'] = creation_date
|
||||||
info['updated_on'] = None # will be set later on
|
info['updated_on'] = None # will be set later on
|
||||||
|
|
||||||
|
# Calculate the initial hash for the downloaded ad
|
||||||
|
info['content_hash'] = calculate_content_hash(info)
|
||||||
|
|
||||||
return info
|
return info
|
||||||
|
|
||||||
async def _extract_category_from_ad_page(self) -> str:
|
async def _extract_category_from_ad_page(self) -> str:
|
||||||
|
|||||||
@@ -113,6 +113,12 @@ kleinanzeigen_bot/__init__.py:
|
|||||||
'Downloaded ad with id %d': 'Anzeige mit der ID %d heruntergeladen'
|
'Downloaded ad with id %d': 'Anzeige mit der ID %d heruntergeladen'
|
||||||
'The page with the id %d does not exist!': 'Die Seite mit der ID %d existiert nicht!'
|
'The page with the id %d does not exist!': 'Die Seite mit der ID %d existiert nicht!'
|
||||||
|
|
||||||
|
__check_ad_republication:
|
||||||
|
"Hash comparison for [%s]:": "Hash-Vergleich für [%s]:"
|
||||||
|
" Stored hash: %s": " Gespeicherter Hash: %s"
|
||||||
|
" Current hash: %s": " Aktueller Hash: %s"
|
||||||
|
"Changes detected in ad [%s], will republish": "Änderungen in Anzeige [%s] erkannt, wird neu veröffentlicht"
|
||||||
|
|
||||||
|
|
||||||
#################################################
|
#################################################
|
||||||
kleinanzeigen_bot/extract.py:
|
kleinanzeigen_bot/extract.py:
|
||||||
|
|||||||
@@ -3,7 +3,7 @@ SPDX-FileCopyrightText: © Sebastian Thomschke and contributors
|
|||||||
SPDX-License-Identifier: AGPL-3.0-or-later
|
SPDX-License-Identifier: AGPL-3.0-or-later
|
||||||
SPDX-ArtifactOfProjectHomePage: https://github.com/Second-Hand-Friends/kleinanzeigen-bot/
|
SPDX-ArtifactOfProjectHomePage: https://github.com/Second-Hand-Friends/kleinanzeigen-bot/
|
||||||
"""
|
"""
|
||||||
import asyncio, copy, decimal, inspect, json, logging, os, re, socket, sys, traceback, time
|
import asyncio, copy, decimal, inspect, json, logging, os, re, socket, sys, traceback, time, hashlib
|
||||||
from importlib.resources import read_text as get_resource_as_string
|
from importlib.resources import read_text as get_resource_as_string
|
||||||
from collections.abc import Callable
|
from collections.abc import Callable
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
@@ -333,3 +333,34 @@ def parse_datetime(date:datetime | str | None) -> datetime | None:
|
|||||||
if isinstance(date, datetime):
|
if isinstance(date, datetime):
|
||||||
return date
|
return date
|
||||||
return datetime.fromisoformat(date)
|
return datetime.fromisoformat(date)
|
||||||
|
|
||||||
|
|
||||||
|
def calculate_content_hash(ad_cfg: dict[str, Any]) -> str:
    """Calculate a hash for user-modifiable fields of the ad.

    Only the fields listed below take part in the hash; every other key of
    ``ad_cfg`` is ignored. Values are normalized (converted to ``str``/``bool``,
    lists sorted, image paths reduced to their file name) before they are
    serialized to JSON with sorted keys, so the result does not depend on key
    order, list order, or the directory an image is stored in.

    Optional sections ("contact", "special_attributes", "images",
    "shipping_options") may be missing or explicitly set to ``None``; both are
    treated as empty.

    :param ad_cfg: the ad configuration
    :return: hex encoded SHA-256 digest of the normalized content
    """
    # A key that is present but None would otherwise break .get()/dict()/iteration below
    contact = ad_cfg.get("contact") or {}
    special_attributes = ad_cfg.get("special_attributes") or {}
    shipping_options = ad_cfg.get("shipping_options") or []
    images = ad_cfg.get("images") or []

    # Relevant fields for the hash
    content = {
        "active": bool(ad_cfg.get("active", True)),  # Explicitly convert to bool
        "type": str(ad_cfg.get("type", "")),  # Explicitly convert to string
        "title": str(ad_cfg.get("title", "")),
        "description": str(ad_cfg.get("description", "")),
        "category": str(ad_cfg.get("category", "")),
        "price": str(ad_cfg.get("price", "")),  # Price always as string
        "price_type": str(ad_cfg.get("price_type", "")),
        "special_attributes": dict(special_attributes),  # Copy the dict
        "shipping_type": str(ad_cfg.get("shipping_type", "")),
        "shipping_costs": str(ad_cfg.get("shipping_costs", "")),
        "shipping_options": sorted(str(x) for x in shipping_options),  # Convert to list and sort
        "sell_directly": bool(ad_cfg.get("sell_directly", False)),  # Explicitly convert to bool
        "images": sorted(os.path.basename(img) if isinstance(img, str) else str(img) for img in images),  # Only filenames
        "contact": {
            "name": str(contact.get("name", "")),
            # Default is the string "None" so that a missing street hashes like a street set to None
            "street": str(contact.get("street", "None")),
            "zipcode": str(contact.get("zipcode", "")),
            "phone": str(contact.get("phone", ""))
        }
    }

    # Create sorted JSON string for consistent hashes
    content_str = json.dumps(content, sort_keys=True)
    return hashlib.sha256(content_str.encode()).hexdigest()
|
||||||
|
|||||||
Reference in New Issue
Block a user