feat: cache published ads data to avoid repetitive API calls during ad download (#809)

This commit is contained in:
Jens
2026-02-03 14:51:59 +01:00
committed by GitHub
parent e994ce1b1f
commit a8051c3814
5 changed files with 136 additions and 326 deletions

View File

@@ -581,11 +581,7 @@ class KleinanzeigenBot(WebScrapingMixin): # noqa: PLR0904
dicts.save_commented_model(
self.config_file_path,
default_config,
header=(
"# yaml-language-server: $schema="
"https://raw.githubusercontent.com/Second-Hand-Friends/kleinanzeigen-bot"
"/main/schemas/config.schema.json"
),
header = ("# yaml-language-server: $schema=https://raw.githubusercontent.com/Second-Hand-Friends/kleinanzeigen-bot/main/schemas/config.schema.json"),
exclude = {"ad_defaults": {"description"}},
)
@@ -2020,8 +2016,21 @@ class KleinanzeigenBot(WebScrapingMixin): # noqa: PLR0904
Determines which download mode was chosen with the arguments, and calls the specified download routine.
This downloads either all, only unsaved (new), or specific ads given by ID.
"""
# Fetch published ads once from manage-ads JSON to avoid repetitive API calls during extraction
# Build lookup dict inline and pass directly to extractor (no cache abstraction needed)
LOG.info("Fetching published ads...")
published_ads = await self._fetch_published_ads()
published_ads_by_id:dict[int, dict[str, Any]] = {}
for published_ad in published_ads:
try:
ad_id = published_ad.get("id")
if ad_id is not None:
published_ads_by_id[int(ad_id)] = published_ad
except (ValueError, TypeError):
LOG.warning("Skipping ad with non-numeric id: %s", published_ad.get("id"))
LOG.info("Loaded %s published ads.", len(published_ads_by_id))
ad_extractor = extract.AdExtractor(self.browser, self.config, self.installation_mode_or_portable)
ad_extractor = extract.AdExtractor(self.browser, self.config, self.installation_mode_or_portable, published_ads_by_id = published_ads_by_id)
# use relevant download routine
if self.ads_selector in {"all", "new"}: # explore ads overview for these two modes

View File

@@ -25,7 +25,6 @@ __all__ = [
LOG:Final[loggers.Logger] = loggers.get_logger(__name__)
_BREADCRUMB_MIN_DEPTH:Final[int] = 2
_SELL_DIRECTLY_MAX_PAGE_LIMIT:Final[int] = 100
BREADCRUMB_RE = re.compile(r"/c(\d+)")
@@ -34,13 +33,20 @@ class AdExtractor(WebScrapingMixin):
Wrapper class for ad extraction that uses an active bot's browser session to extract specific elements from an ad page.
"""
def __init__(self, browser:Browser, config:Config, installation_mode:xdg_paths.InstallationMode = "portable") -> None:
def __init__(
self,
browser:Browser,
config:Config,
installation_mode:xdg_paths.InstallationMode = "portable",
published_ads_by_id:dict[int, dict[str, Any]] | None = None,
) -> None:
super().__init__()
self.browser = browser
self.config:Config = config
if installation_mode not in {"portable", "xdg"}:
raise ValueError(f"Unsupported installation mode: {installation_mode}")
self.installation_mode:xdg_paths.InstallationMode = installation_mode
self.published_ads_by_id:dict[int, dict[str, Any]] = published_ads_by_id or {}
async def download_ad(self, ad_id:int) -> None:
"""
@@ -231,14 +237,19 @@ class AdExtractor(WebScrapingMixin):
"""
info:dict[str, Any] = {"active": True}
# extract basic info
info["type"] = "OFFER" if "s-anzeige" in self.page.url else "WANTED"
# Extract title
# Extract title first (needed for directory creation)
title = await self._extract_title_from_ad_page()
# Get BelenConf data which contains accurate ad_type information
belen_conf = await self.web_execute("window.BelenConf")
# Extract ad type from BelenConf - more reliable than URL pattern matching
# BelenConf contains "ad_type":"WANTED" or "ad_type":"OFFER" in dimensions
ad_type_from_conf = None
if isinstance(belen_conf, dict):
ad_type_from_conf = belen_conf.get("universalAnalyticsOpts", {}).get("dimensions", {}).get("ad_type")
info["type"] = ad_type_from_conf if ad_type_from_conf in {"OFFER", "WANTED"} else ("OFFER" if "s-anzeige" in self.page.url else "WANTED")
info["category"] = await self._extract_category_from_ad_page()
# append subcategory and change e.g. category "161/172" to "161/172/lautsprecher_kopfhoerer"
@@ -515,72 +526,35 @@ class AdExtractor(WebScrapingMixin):
async def _extract_sell_directly_from_ad_page(self) -> bool | None:
"""
Extracts the sell directly option from an ad page using the JSON API.
Extracts the sell directly option from an ad page using the published ads data.
Uses data passed at construction time (from the manage-ads JSON) to avoid
repetitive API calls that create a bot detection signature.
:return: bool | None - True if buyNowEligible, False if not eligible, None if unknown
"""
try:
# Extract current ad ID from the page URL first
# Extract current ad ID from the page URL
current_ad_id = self.extract_ad_id_from_ad_url(self.page.url)
if current_ad_id == -1:
LOG.warning("Could not extract ad ID from URL: %s", self.page.url)
return None
# Fetch the management JSON data using web_request with pagination support
page = 1
# Direct dict lookup (O(1) instead of O(pages) API calls)
cached_ad = self.published_ads_by_id.get(current_ad_id)
if cached_ad is not None:
buy_now_eligible = cached_ad.get("buyNowEligible")
if isinstance(buy_now_eligible, bool):
LOG.debug("sell_directly from data for ad %s: %s", current_ad_id, buy_now_eligible)
return buy_now_eligible
LOG.debug("buyNowEligible not a bool for ad %s: %s", current_ad_id, buy_now_eligible)
return None
while True:
# Safety check: don't paginate beyond reasonable limit
if page > _SELL_DIRECTLY_MAX_PAGE_LIMIT:
LOG.warning("Stopping pagination after %s pages to avoid infinite loop", _SELL_DIRECTLY_MAX_PAGE_LIMIT)
break
response = await self.web_request(f"https://www.kleinanzeigen.de/m-meine-anzeigen-verwalten.json?sort=DEFAULT&pageNum={page}")
try:
json_data = json.loads(response["content"])
except json.JSONDecodeError as ex:
LOG.debug("Failed to parse JSON response on page %s: %s", page, ex)
break
# Find the current ad in the ads list
if isinstance(json_data, dict) and "ads" in json_data:
ads_list = json_data["ads"]
if isinstance(ads_list, list):
# Filter ads to find the current ad by ID
current_ad = next((ad for ad in ads_list if ad.get("id") == current_ad_id), None)
if current_ad and "buyNowEligible" in current_ad:
buy_now_eligible = current_ad["buyNowEligible"]
return buy_now_eligible if isinstance(buy_now_eligible, bool) else None
# Check if we need to fetch more pages
paging = json_data.get("paging") if isinstance(json_data, dict) else None
if not isinstance(paging, dict):
break
# Parse pagination info using real API fields
current_page_num = misc.coerce_page_number(paging.get("pageNum"))
total_pages = misc.coerce_page_number(paging.get("last"))
if current_page_num is None:
LOG.warning("Invalid 'pageNum' in paging info: %s, stopping pagination", paging.get("pageNum"))
break
# Stop if we've reached the last page
if total_pages is None or current_page_num >= total_pages:
break
# Use API's next field for navigation (more robust than our counter)
next_page = misc.coerce_page_number(paging.get("next"))
if next_page is None:
LOG.warning("Invalid 'next' page value in paging info: %s, stopping pagination", paging.get("next"))
break
page = next_page
# If the key doesn't exist or ad not found, return None (unknown)
# Ad not in user's published ads (may be someone else's ad)
LOG.debug("No data for ad %s, returning None for sell_directly", current_ad_id)
return None
except (TimeoutError, json.JSONDecodeError, KeyError, TypeError) as e:
except (KeyError, TypeError) as e:
LOG.debug("Could not determine sell_directly status: %s", e)
return None

View File

@@ -225,12 +225,15 @@ kleinanzeigen_bot/__init__.py:
"Attribute field '%s' seems to be a Combobox (i.e. text input with filtering dropdown)...": "Attributfeld '%s' scheint eine Combobox zu sein (d.h. Texteingabefeld mit Dropdown-Filter)..."
download_ads:
"Fetching published ads...": "Lade veröffentlichte Anzeigen..."
"Loaded %s published ads.": "%s veröffentlichte Anzeigen geladen."
"Scanning your ad overview...": "Scanne Anzeigenübersicht..."
"%s found.": "%s gefunden."
"ad": "Anzeige"
"Starting download of all ads...": "Starte den Download aller Anzeigen..."
"%d of %d ads were downloaded from your profile.": "%d von %d Anzeigen wurden aus Ihrem Profil heruntergeladen."
"Starting download of not yet downloaded ads...": "Starte den Download noch nicht heruntergeladener Anzeigen..."
"Skipping ad with non-numeric id: %s": "Überspringe Anzeige mit nicht-numerischer ID: %s"
"The ad with id %d has already been saved.": "Die Anzeige mit der ID %d wurde bereits gespeichert."
"%s were downloaded from your profile.": "%s wurden aus Ihrem Profil heruntergeladen."
"new ad": "neue Anzeige"
@@ -317,9 +320,6 @@ kleinanzeigen_bot/extract.py:
_extract_sell_directly_from_ad_page:
"Could not extract ad ID from URL: %s": "Konnte Anzeigen-ID nicht aus der URL extrahieren: %s"
"Stopping pagination after %s pages to avoid infinite loop": "Stoppe die Seitenaufschaltung nach %s Seiten, um eine Endlosschleife zu vermeiden"
"Invalid 'next' page value in paging info: %s, stopping pagination": "Ungültiger 'next'-Seitenwert in Paginierungsinfo: %s, beende Paginierung"
"Invalid 'pageNum' in paging info: %s, stopping pagination": "Ungültiger 'pageNum'-Wert in Paginierungsinfo: %s, beende Paginierung"
#################################################
kleinanzeigen_bot/utils/i18n.py: