enh: allow per-ad overriding of global description affixes (#416)

This commit is contained in:
Jens Bergmann
2025-02-11 23:39:26 +01:00
committed by GitHub
parent a67112d936
commit 4051620aed
9 changed files with 559 additions and 66 deletions

View File

@@ -17,7 +17,7 @@ from ruamel.yaml import YAML
from wcmatch import glob
from . import extract, resources
from .ads import calculate_content_hash
from .ads import calculate_content_hash, get_description_affixes
from .utils import dicts, error_handlers, loggers, misc
from .utils.files import abspath
from .utils.i18n import Locale, get_current_locale, set_current_locale, pluralize
@@ -318,11 +318,6 @@ class KleinanzeigenBot(WebScrapingMixin):
if not ad_files:
return []
description_config = {
"prefix": self.config["ad_defaults"]["description"]["prefix"] or "",
"suffix": self.config["ad_defaults"]["description"]["suffix"] or ""
}
ids = []
use_specific_ads = False
if re.compile(r'\d+[,\d+]*').search(self.ads_selector):
@@ -356,10 +351,18 @@ class KleinanzeigenBot(WebScrapingMixin):
if not self.__check_ad_republication(ad_cfg, ad_cfg_orig, ad_file_relative):
continue
ad_cfg["description"] = description_config["prefix"] + (ad_cfg["description"] or "") + description_config["suffix"]
# Get prefix/suffix from ad config if present, otherwise use defaults
prefix = ad_cfg.get("prefix", self.config["ad_defaults"]["description"]["prefix"] or "")
suffix = ad_cfg.get("suffix", self.config["ad_defaults"]["description"]["suffix"] or "")
# Combine description parts
ad_cfg["description"] = prefix + (ad_cfg["description"] or "") + suffix
ad_cfg["description"] = ad_cfg["description"].replace("@", "(at)")
ensure(len(ad_cfg["description"]) <= 4000, f"""Length of ad description including prefix and suffix exceeds 4000 chars. Description length: {
len(ad_cfg['description'])} chars. @ {ad_file}""")
# Validate total length
ensure(len(ad_cfg["description"]) <= 4000,
f"""Length of ad description including prefix and suffix exceeds 4000 chars. Description length: {
len(ad_cfg["description"])} chars. @ {ad_file}.""")
# pylint: disable=cell-var-from-loop
def assert_one_of(path:str, allowed:Iterable[str]) -> None:
@@ -693,7 +696,8 @@ class KleinanzeigenBot(WebScrapingMixin):
#############################
# set description
#############################
await self.web_execute("document.querySelector('#pstad-descrptn').value = `" + ad_cfg["description"].replace("`", "'") + "`")
description = self.__get_description_with_affixes(ad_cfg)
await self.web_execute("document.querySelector('#pstad-descrptn').value = `" + description.replace("`", "'") + "`")
#############################
# set contact zipcode
@@ -1040,10 +1044,64 @@ class KleinanzeigenBot(WebScrapingMixin):
else:
LOG.error('The page with the id %d does not exist!', ad_id)
def __get_description_with_affixes(self, ad_cfg: dict[str, Any]) -> str:
    """Get the complete description with prefix and suffix applied.

    Precedence (highest to lowest):
    1. Direct ad-level affixes (description_prefix/suffix)
    2. Legacy nested ad-level affixes (description.prefix/suffix)
    3. Global flattened affixes (ad_defaults.description_prefix/suffix)
    4. Legacy global nested affixes (ad_defaults.description.prefix/suffix)

    Levels 3 and 4 are resolved by `get_description_affixes(self.config, ...)`.

    Args:
        ad_cfg: The ad configuration dictionary. Its "description" entry may be
            a plain string or a dict holding the text under the "text" key;
            any other value (including a missing entry) yields an empty text.

    Returns:
        The complete description with prefix and suffix applied

    Note:
        The combined length is checked with `ensure(...)` against the
        4000 character limit (failure behavior is defined by `ensure`).
    """
    # Get the main description text
    raw_description = ad_cfg.get("description")
    description_text: Any = ""
    if isinstance(raw_description, dict):
        nested_text = raw_description.get("text")
        # An explicit null text must not be rendered as the literal "None" by str() below
        description_text = "" if nested_text is None else nested_text
    elif isinstance(raw_description, str):
        description_text = raw_description

    def resolve_affix(affix_type: str) -> Any:
        """Return the first non-None affix ("prefix" or "suffix") in precedence order."""
        # 1. Direct ad-level affix
        direct_value = ad_cfg.get(f"description_{affix_type}")
        if direct_value is not None:
            return direct_value
        # 2. Legacy nested ad-level affix
        nested_value = dicts.safe_get(ad_cfg, "description", affix_type)
        if nested_value is not None:
            return nested_value
        # 3./4. Global affix from config (flattened first, then legacy nested)
        return get_description_affixes(self.config, prefix=affix_type == "prefix")

    prefix = resolve_affix("prefix")
    suffix = resolve_affix("suffix")

    # Combine the parts; str() guards against non-string values coming from the ad file
    final_description = str(prefix) + str(description_text) + str(suffix)

    # Validate length
    ensure(len(final_description) <= 4000,
        f"Length of ad description including prefix and suffix exceeds 4000 chars. Description length: {len(final_description)} chars.")

    return final_description
#############################
# main entry point
#############################
def main(args:list[str]) -> None:
if "version" not in args:
print(textwrap.dedent(r"""

View File

@@ -5,6 +5,7 @@ SPDX-ArtifactOfProjectHomePage: https://github.com/Second-Hand-Friends/kleinanze
"""
import hashlib, json, os
from typing import Any
from .utils import dicts
def calculate_content_hash(ad_cfg: dict[str, Any]) -> str:
@@ -36,3 +37,51 @@ def calculate_content_hash(ad_cfg: dict[str, Any]) -> str:
# Create sorted JSON string for consistent hashes
content_str = json.dumps(content, sort_keys = True)
return hashlib.sha256(content_str.encode()).hexdigest()
def get_description_affixes(config: dict[str, Any], prefix: bool = True) -> str:
    """Look up the global description prefix or suffix in the configuration.

    Two layouts are supported and checked in this order:

    1. Flattened (current) layout:
        ad_defaults:
          description_prefix: "Global Prefix"
          description_suffix: "Global Suffix"

    2. Nested (legacy) layout:
        ad_defaults:
          description:
            prefix: "Legacy Prefix"
            suffix: "Legacy Suffix"

    Only string values are accepted; a non-string value at one location is
    ignored and the next location is tried.

    Args:
        config: Configuration dictionary containing ad_defaults
        prefix: If True, look up the prefix, otherwise the suffix

    Returns:
        The first matching affix string, or an empty string if `config` is not
        a dict or neither location holds a string.

    Example:
        >>> config = {"ad_defaults": {"description_prefix": "Hello", "description": {"prefix": "Hi"}}}
        >>> get_description_affixes(config, prefix=True)
        'Hello'
    """
    # Anything that is not a mapping cannot hold affixes
    if not isinstance(config, dict):
        return ""

    kind = "prefix" if prefix else "suffix"

    # Candidate key paths, highest precedence first
    lookup_paths = (
        ("ad_defaults", f"description_{kind}"),   # flattened format
        ("ad_defaults", "description", kind),     # legacy nested format
    )

    for key_path in lookup_paths:
        candidate = dicts.safe_get(config, *key_path)
        if isinstance(candidate, str):
            return candidate

    return ""

View File

@@ -8,7 +8,7 @@ import urllib.request as urllib_request
from datetime import datetime
from typing import Any, Final
from .ads import calculate_content_hash
from .ads import calculate_content_hash, get_description_affixes
from .utils import dicts, i18n, loggers, misc, reflect
from .utils.web_scraping_mixin import Browser, By, Element, Is, WebScrapingMixin
@@ -236,9 +236,23 @@ class AdExtractor(WebScrapingMixin):
info['category'] = await self._extract_category_from_ad_page()
info['title'] = title
info['description'] = (await self.web_text(By.ID, 'viewad-description-text')).strip() \
.removeprefix((self.config["ad_defaults"]["description"]["prefix"] or "").strip()) \
.removesuffix((self.config["ad_defaults"]["description"]["suffix"] or "").strip())
# Get raw description text
raw_description = (await self.web_text(By.ID, 'viewad-description-text')).strip()
# Get prefix and suffix from config
prefix = get_description_affixes(self.config, prefix=True)
suffix = get_description_affixes(self.config, prefix=False)
# Remove prefix and suffix if present
description_text = raw_description
if prefix and description_text.startswith(prefix.strip()):
description_text = description_text[len(prefix.strip()):]
if suffix and description_text.endswith(suffix.strip()):
description_text = description_text[:-len(suffix.strip())]
info['description'] = description_text.strip()
info['special_attributes'] = await self._extract_special_attributes_from_ad_page()
if "art_s" in info['special_attributes']:
# change e.g. category "161/172" to "161/172/lautsprecher_kopfhoerer"

View File

@@ -5,9 +5,9 @@ ad_files:
ad_defaults:
active: true
type: OFFER # one of: OFFER, WANTED
description:
prefix: ""
suffix: ""
description_prefix: "" # prefix for the ad description
description_suffix: "" # suffix for the ad description
price_type: NEGOTIABLE # one of: FIXED, NEGOTIABLE, GIVE_AWAY, NOT_APPLICABLE
shipping_type: SHIPPING # one of: PICKUP, SHIPPING, NOT_APPLICABLE
sell_directly: false # requires shipping_options to take effect