feat: cache published ads data to avoid repetitive API calls during ad download (#809)

2026-03-12 10:31:50 +01:00 · 2026-02-03 14:51:59 +01:00
parent e994ce1b1f
commit a8051c3814
5 changed files with 136 additions and 326 deletions
--- a/src/kleinanzeigen_bot/__init__.py
+++ b/src/kleinanzeigen_bot/__init__.py
@@ -581,11 +581,7 @@ class KleinanzeigenBot(WebScrapingMixin):  # noqa: PLR0904
        dicts.save_commented_model(
            self.config_file_path,
            default_config,
-            header=(
-                "# yaml-language-server: $schema="
-                "https://raw.githubusercontent.com/Second-Hand-Friends/kleinanzeigen-bot"
-                "/main/schemas/config.schema.json"
-            ),
+            header = ("# yaml-language-server: $schema=https://raw.githubusercontent.com/Second-Hand-Friends/kleinanzeigen-bot/main/schemas/config.schema.json"),
            exclude = {"ad_defaults": {"description"}},
        )

@@ -2020,8 +2016,21 @@ class KleinanzeigenBot(WebScrapingMixin):  # noqa: PLR0904
        Determines which download mode was chosen with the arguments, and calls the specified download routine.
        This downloads either all, only unsaved (new), or specific ads given by ID.
        """
+        # Fetch published ads once from manage-ads JSON to avoid repetitive API calls during extraction
+        # Build lookup dict inline and pass directly to extractor (no cache abstraction needed)
+        LOG.info("Fetching published ads...")
+        published_ads = await self._fetch_published_ads()
+        published_ads_by_id:dict[int, dict[str, Any]] = {}
+        for published_ad in published_ads:
+            try:
+                ad_id = published_ad.get("id")
+                if ad_id is not None:
+                    published_ads_by_id[int(ad_id)] = published_ad
+            except (ValueError, TypeError):
+                LOG.warning("Skipping ad with non-numeric id: %s", published_ad.get("id"))
+        LOG.info("Loaded %s published ads.", len(published_ads_by_id))

-        ad_extractor = extract.AdExtractor(self.browser, self.config, self.installation_mode_or_portable)
+        ad_extractor = extract.AdExtractor(self.browser, self.config, self.installation_mode_or_portable, published_ads_by_id = published_ads_by_id)

        # use relevant download routine
        if self.ads_selector in {"all", "new"}:  # explore ads overview for these two modes
--- a/src/kleinanzeigen_bot/extract.py
+++ b/src/kleinanzeigen_bot/extract.py
@@ -25,7 +25,6 @@ __all__ = [
 LOG:Final[loggers.Logger] = loggers.get_logger(__name__)

 _BREADCRUMB_MIN_DEPTH:Final[int] = 2
-_SELL_DIRECTLY_MAX_PAGE_LIMIT:Final[int] = 100
 BREADCRUMB_RE = re.compile(r"/c(\d+)")


@@ -34,13 +33,20 @@ class AdExtractor(WebScrapingMixin):
    Wrapper class for ad extraction that uses an active bot´s browser session to extract specific elements from an ad page.
    """

-    def __init__(self, browser:Browser, config:Config, installation_mode:xdg_paths.InstallationMode = "portable") -> None:
+    def __init__(
+        self,
+        browser:Browser,
+        config:Config,
+        installation_mode:xdg_paths.InstallationMode = "portable",
+        published_ads_by_id:dict[int, dict[str, Any]] | None = None,
+    ) -> None:
        super().__init__()
        self.browser = browser
        self.config:Config = config
        if installation_mode not in {"portable", "xdg"}:
            raise ValueError(f"Unsupported installation mode: {installation_mode}")
        self.installation_mode:xdg_paths.InstallationMode = installation_mode
+        self.published_ads_by_id:dict[int, dict[str, Any]] = published_ads_by_id or {}

    async def download_ad(self, ad_id:int) -> None:
        """
@@ -231,14 +237,19 @@ class AdExtractor(WebScrapingMixin):
        """
        info:dict[str, Any] = {"active": True}

-        # extract basic info
-        info["type"] = "OFFER" if "s-anzeige" in self.page.url else "WANTED"
-
-        # Extract title
+        # Extract title first (needed for directory creation)
        title = await self._extract_title_from_ad_page()

+        # Get BelenConf data which contains accurate ad_type information
        belen_conf = await self.web_execute("window.BelenConf")

+        # Extract ad type from BelenConf - more reliable than URL pattern matching
+        # BelenConf contains "ad_type":"WANTED" or "ad_type":"OFFER" in dimensions
+        ad_type_from_conf = None
+        if isinstance(belen_conf, dict):
+            ad_type_from_conf = belen_conf.get("universalAnalyticsOpts", {}).get("dimensions", {}).get("ad_type")
+        info["type"] = ad_type_from_conf if ad_type_from_conf in {"OFFER", "WANTED"} else ("OFFER" if "s-anzeige" in self.page.url else "WANTED")
+
        info["category"] = await self._extract_category_from_ad_page()

        # append subcategory and change e.g. category "161/172" to "161/172/lautsprecher_kopfhoerer"
@@ -515,72 +526,35 @@ class AdExtractor(WebScrapingMixin):

    async def _extract_sell_directly_from_ad_page(self) -> bool | None:
        """
-        Extracts the sell directly option from an ad page using the JSON API.
+        Extracts the sell directly option from an ad page using the published ads data.
+
+        Uses data passed at construction time (from the manage-ads JSON) to avoid
+        repetitive API calls that create a bot detection signature.

        :return: bool | None - True if buyNowEligible, False if not eligible, None if unknown
        """
        try:
-            # Extract current ad ID from the page URL first
+            # Extract current ad ID from the page URL
            current_ad_id = self.extract_ad_id_from_ad_url(self.page.url)
            if current_ad_id == -1:
                LOG.warning("Could not extract ad ID from URL: %s", self.page.url)
                return None

-            # Fetch the management JSON data using web_request with pagination support
-            page = 1
+            # Direct dict lookup (O(1) instead of O(pages) API calls)
+            cached_ad = self.published_ads_by_id.get(current_ad_id)
+            if cached_ad is not None:
+                buy_now_eligible = cached_ad.get("buyNowEligible")
+                if isinstance(buy_now_eligible, bool):
+                    LOG.debug("sell_directly from data for ad %s: %s", current_ad_id, buy_now_eligible)
+                    return buy_now_eligible
+                LOG.debug("buyNowEligible not a bool for ad %s: %s", current_ad_id, buy_now_eligible)
+                return None

-            while True:
-                # Safety check: don't paginate beyond reasonable limit
-                if page > _SELL_DIRECTLY_MAX_PAGE_LIMIT:
-                    LOG.warning("Stopping pagination after %s pages to avoid infinite loop", _SELL_DIRECTLY_MAX_PAGE_LIMIT)
-                    break
-
-                response = await self.web_request(f"https://www.kleinanzeigen.de/m-meine-anzeigen-verwalten.json?sort=DEFAULT&pageNum={page}")
-
-                try:
-                    json_data = json.loads(response["content"])
-                except json.JSONDecodeError as ex:
-                    LOG.debug("Failed to parse JSON response on page %s: %s", page, ex)
-                    break
-
-                # Find the current ad in the ads list
-                if isinstance(json_data, dict) and "ads" in json_data:
-                    ads_list = json_data["ads"]
-                    if isinstance(ads_list, list):
-                        # Filter ads to find the current ad by ID
-                        current_ad = next((ad for ad in ads_list if ad.get("id") == current_ad_id), None)
-                        if current_ad and "buyNowEligible" in current_ad:
-                            buy_now_eligible = current_ad["buyNowEligible"]
-                            return buy_now_eligible if isinstance(buy_now_eligible, bool) else None
-
-                # Check if we need to fetch more pages
-                paging = json_data.get("paging") if isinstance(json_data, dict) else None
-                if not isinstance(paging, dict):
-                    break
-
-                # Parse pagination info using real API fields
-                current_page_num = misc.coerce_page_number(paging.get("pageNum"))
-                total_pages = misc.coerce_page_number(paging.get("last"))
-
-                if current_page_num is None:
-                    LOG.warning("Invalid 'pageNum' in paging info: %s, stopping pagination", paging.get("pageNum"))
-                    break
-
-                # Stop if we've reached the last page
-                if total_pages is None or current_page_num >= total_pages:
-                    break
-
-                # Use API's next field for navigation (more robust than our counter)
-                next_page = misc.coerce_page_number(paging.get("next"))
-                if next_page is None:
-                    LOG.warning("Invalid 'next' page value in paging info: %s, stopping pagination", paging.get("next"))
-                    break
-                page = next_page
-
-            # If the key doesn't exist or ad not found, return None (unknown)
+            # Ad not in user's published ads (may be someone else's ad)
+            LOG.debug("No data for ad %s, returning None for sell_directly", current_ad_id)
            return None

-        except (TimeoutError, json.JSONDecodeError, KeyError, TypeError) as e:
+        except (KeyError, TypeError) as e:
            LOG.debug("Could not determine sell_directly status: %s", e)
            return None

--- a/src/kleinanzeigen_bot/resources/translations.de.yaml
+++ b/src/kleinanzeigen_bot/resources/translations.de.yaml
@@ -225,12 +225,15 @@ kleinanzeigen_bot/__init__.py:
    "Attribute field '%s' seems to be a Combobox (i.e. text input with filtering dropdown)...": "Attributfeld '%s' scheint eine Combobox zu sein (d.h. Texteingabefeld mit Dropdown-Filter)..."

  download_ads:
+    "Fetching published ads...": "Lade veröffentlichte Anzeigen..."
+    "Loaded %s published ads.": "%s veröffentlichte Anzeigen geladen."
    "Scanning your ad overview...": "Scanne Anzeigenübersicht..."
    "%s found.": "%s gefunden."
    "ad": "Anzeige"
    "Starting download of all ads...": "Starte den Download aller Anzeigen..."
    "%d of %d ads were downloaded from your profile.": "%d von %d Anzeigen wurden aus Ihrem Profil heruntergeladen."
    "Starting download of not yet downloaded ads...": "Starte den Download noch nicht heruntergeladener Anzeigen..."
+    "Skipping ad with non-numeric id: %s": "Überspringe Anzeige mit nicht-numerischer ID: %s"
    "The ad with id %d has already been saved.": "Die Anzeige mit der ID %d wurde bereits gespeichert."
    "%s were downloaded from your profile.": "%s wurden aus Ihrem Profil heruntergeladen."
    "new ad": "neue Anzeige"
@@ -317,9 +320,6 @@ kleinanzeigen_bot/extract.py:

  _extract_sell_directly_from_ad_page:
    "Could not extract ad ID from URL: %s": "Konnte Anzeigen-ID nicht aus der URL extrahieren: %s"
-    "Stopping pagination after %s pages to avoid infinite loop": "Stoppe die Seitenaufschaltung nach %s Seiten, um eine Endlosschleife zu vermeiden"
-    "Invalid 'next' page value in paging info: %s, stopping pagination": "Ungültiger 'next'-Seitenwert in Paginierungsinfo: %s, beende Paginierung"
-    "Invalid 'pageNum' in paging info: %s, stopping pagination": "Ungültiger 'pageNum'-Wert in Paginierungsinfo: %s, beende Paginierung"

 #################################################
 kleinanzeigen_bot/utils/i18n.py: