mirror of
https://github.com/Second-Hand-Friends/kleinanzeigen-bot.git
synced 2026-03-12 10:31:50 +01:00
feat: cache published ads data to avoid repetitive API calls during ad download (#809)
This commit is contained in:
@@ -581,11 +581,7 @@ class KleinanzeigenBot(WebScrapingMixin): # noqa: PLR0904
|
||||
dicts.save_commented_model(
|
||||
self.config_file_path,
|
||||
default_config,
|
||||
header=(
|
||||
"# yaml-language-server: $schema="
|
||||
"https://raw.githubusercontent.com/Second-Hand-Friends/kleinanzeigen-bot"
|
||||
"/main/schemas/config.schema.json"
|
||||
),
|
||||
header = ("# yaml-language-server: $schema=https://raw.githubusercontent.com/Second-Hand-Friends/kleinanzeigen-bot/main/schemas/config.schema.json"),
|
||||
exclude = {"ad_defaults": {"description"}},
|
||||
)
|
||||
|
||||
@@ -2020,8 +2016,21 @@ class KleinanzeigenBot(WebScrapingMixin): # noqa: PLR0904
|
||||
Determines which download mode was chosen with the arguments, and calls the specified download routine.
|
||||
This downloads either all, only unsaved (new), or specific ads given by ID.
|
||||
"""
|
||||
# Fetch published ads once from manage-ads JSON to avoid repetitive API calls during extraction
|
||||
# Build lookup dict inline and pass directly to extractor (no cache abstraction needed)
|
||||
LOG.info("Fetching published ads...")
|
||||
published_ads = await self._fetch_published_ads()
|
||||
published_ads_by_id:dict[int, dict[str, Any]] = {}
|
||||
for published_ad in published_ads:
|
||||
try:
|
||||
ad_id = published_ad.get("id")
|
||||
if ad_id is not None:
|
||||
published_ads_by_id[int(ad_id)] = published_ad
|
||||
except (ValueError, TypeError):
|
||||
LOG.warning("Skipping ad with non-numeric id: %s", published_ad.get("id"))
|
||||
LOG.info("Loaded %s published ads.", len(published_ads_by_id))
|
||||
|
||||
ad_extractor = extract.AdExtractor(self.browser, self.config, self.installation_mode_or_portable)
|
||||
ad_extractor = extract.AdExtractor(self.browser, self.config, self.installation_mode_or_portable, published_ads_by_id = published_ads_by_id)
|
||||
|
||||
# use relevant download routine
|
||||
if self.ads_selector in {"all", "new"}: # explore ads overview for these two modes
|
||||
|
||||
@@ -25,7 +25,6 @@ __all__ = [
|
||||
LOG:Final[loggers.Logger] = loggers.get_logger(__name__)
|
||||
|
||||
_BREADCRUMB_MIN_DEPTH:Final[int] = 2
|
||||
_SELL_DIRECTLY_MAX_PAGE_LIMIT:Final[int] = 100
|
||||
BREADCRUMB_RE = re.compile(r"/c(\d+)")
|
||||
|
||||
|
||||
@@ -34,13 +33,20 @@ class AdExtractor(WebScrapingMixin):
|
||||
Wrapper class for ad extraction that uses an active bot's browser session to extract specific elements from an ad page.
|
||||
"""
|
||||
|
||||
def __init__(self, browser:Browser, config:Config, installation_mode:xdg_paths.InstallationMode = "portable") -> None:
|
||||
def __init__(
|
||||
self,
|
||||
browser:Browser,
|
||||
config:Config,
|
||||
installation_mode:xdg_paths.InstallationMode = "portable",
|
||||
published_ads_by_id:dict[int, dict[str, Any]] | None = None,
|
||||
) -> None:
|
||||
super().__init__()
|
||||
self.browser = browser
|
||||
self.config:Config = config
|
||||
if installation_mode not in {"portable", "xdg"}:
|
||||
raise ValueError(f"Unsupported installation mode: {installation_mode}")
|
||||
self.installation_mode:xdg_paths.InstallationMode = installation_mode
|
||||
self.published_ads_by_id:dict[int, dict[str, Any]] = published_ads_by_id or {}
|
||||
|
||||
async def download_ad(self, ad_id:int) -> None:
|
||||
"""
|
||||
@@ -231,14 +237,19 @@ class AdExtractor(WebScrapingMixin):
|
||||
"""
|
||||
info:dict[str, Any] = {"active": True}
|
||||
|
||||
# extract basic info
|
||||
info["type"] = "OFFER" if "s-anzeige" in self.page.url else "WANTED"
|
||||
|
||||
# Extract title
|
||||
# Extract title first (needed for directory creation)
|
||||
title = await self._extract_title_from_ad_page()
|
||||
|
||||
# Get BelenConf data which contains accurate ad_type information
|
||||
belen_conf = await self.web_execute("window.BelenConf")
|
||||
|
||||
# Extract ad type from BelenConf - more reliable than URL pattern matching
|
||||
# BelenConf contains "ad_type":"WANTED" or "ad_type":"OFFER" in dimensions
|
||||
ad_type_from_conf = None
|
||||
if isinstance(belen_conf, dict):
|
||||
ad_type_from_conf = belen_conf.get("universalAnalyticsOpts", {}).get("dimensions", {}).get("ad_type")
|
||||
info["type"] = ad_type_from_conf if ad_type_from_conf in {"OFFER", "WANTED"} else ("OFFER" if "s-anzeige" in self.page.url else "WANTED")
|
||||
|
||||
info["category"] = await self._extract_category_from_ad_page()
|
||||
|
||||
# append subcategory and change e.g. category "161/172" to "161/172/lautsprecher_kopfhoerer"
|
||||
@@ -515,72 +526,35 @@ class AdExtractor(WebScrapingMixin):
|
||||
|
||||
async def _extract_sell_directly_from_ad_page(self) -> bool | None:
|
||||
"""
|
||||
Extracts the sell directly option from an ad page using the JSON API.
|
||||
Extracts the sell directly option from an ad page using the published ads data.
|
||||
|
||||
Uses data passed at construction time (from the manage-ads JSON) to avoid
|
||||
repetitive API calls that create a bot detection signature.
|
||||
|
||||
:return: bool | None - True if buyNowEligible, False if not eligible, None if unknown
|
||||
"""
|
||||
try:
|
||||
# Extract current ad ID from the page URL first
|
||||
# Extract current ad ID from the page URL
|
||||
current_ad_id = self.extract_ad_id_from_ad_url(self.page.url)
|
||||
if current_ad_id == -1:
|
||||
LOG.warning("Could not extract ad ID from URL: %s", self.page.url)
|
||||
return None
|
||||
|
||||
# Fetch the management JSON data using web_request with pagination support
|
||||
page = 1
|
||||
# Direct dict lookup (O(1) instead of O(pages) API calls)
|
||||
cached_ad = self.published_ads_by_id.get(current_ad_id)
|
||||
if cached_ad is not None:
|
||||
buy_now_eligible = cached_ad.get("buyNowEligible")
|
||||
if isinstance(buy_now_eligible, bool):
|
||||
LOG.debug("sell_directly from data for ad %s: %s", current_ad_id, buy_now_eligible)
|
||||
return buy_now_eligible
|
||||
LOG.debug("buyNowEligible not a bool for ad %s: %s", current_ad_id, buy_now_eligible)
|
||||
return None
|
||||
|
||||
while True:
|
||||
# Safety check: don't paginate beyond reasonable limit
|
||||
if page > _SELL_DIRECTLY_MAX_PAGE_LIMIT:
|
||||
LOG.warning("Stopping pagination after %s pages to avoid infinite loop", _SELL_DIRECTLY_MAX_PAGE_LIMIT)
|
||||
break
|
||||
|
||||
response = await self.web_request(f"https://www.kleinanzeigen.de/m-meine-anzeigen-verwalten.json?sort=DEFAULT&pageNum={page}")
|
||||
|
||||
try:
|
||||
json_data = json.loads(response["content"])
|
||||
except json.JSONDecodeError as ex:
|
||||
LOG.debug("Failed to parse JSON response on page %s: %s", page, ex)
|
||||
break
|
||||
|
||||
# Find the current ad in the ads list
|
||||
if isinstance(json_data, dict) and "ads" in json_data:
|
||||
ads_list = json_data["ads"]
|
||||
if isinstance(ads_list, list):
|
||||
# Filter ads to find the current ad by ID
|
||||
current_ad = next((ad for ad in ads_list if ad.get("id") == current_ad_id), None)
|
||||
if current_ad and "buyNowEligible" in current_ad:
|
||||
buy_now_eligible = current_ad["buyNowEligible"]
|
||||
return buy_now_eligible if isinstance(buy_now_eligible, bool) else None
|
||||
|
||||
# Check if we need to fetch more pages
|
||||
paging = json_data.get("paging") if isinstance(json_data, dict) else None
|
||||
if not isinstance(paging, dict):
|
||||
break
|
||||
|
||||
# Parse pagination info using real API fields
|
||||
current_page_num = misc.coerce_page_number(paging.get("pageNum"))
|
||||
total_pages = misc.coerce_page_number(paging.get("last"))
|
||||
|
||||
if current_page_num is None:
|
||||
LOG.warning("Invalid 'pageNum' in paging info: %s, stopping pagination", paging.get("pageNum"))
|
||||
break
|
||||
|
||||
# Stop if we've reached the last page
|
||||
if total_pages is None or current_page_num >= total_pages:
|
||||
break
|
||||
|
||||
# Use API's next field for navigation (more robust than our counter)
|
||||
next_page = misc.coerce_page_number(paging.get("next"))
|
||||
if next_page is None:
|
||||
LOG.warning("Invalid 'next' page value in paging info: %s, stopping pagination", paging.get("next"))
|
||||
break
|
||||
page = next_page
|
||||
|
||||
# If the key doesn't exist or ad not found, return None (unknown)
|
||||
# Ad not in user's published ads (may be someone else's ad)
|
||||
LOG.debug("No data for ad %s, returning None for sell_directly", current_ad_id)
|
||||
return None
|
||||
|
||||
except (TimeoutError, json.JSONDecodeError, KeyError, TypeError) as e:
|
||||
except (KeyError, TypeError) as e:
|
||||
LOG.debug("Could not determine sell_directly status: %s", e)
|
||||
return None
|
||||
|
||||
|
||||
@@ -225,12 +225,15 @@ kleinanzeigen_bot/__init__.py:
|
||||
"Attribute field '%s' seems to be a Combobox (i.e. text input with filtering dropdown)...": "Attributfeld '%s' scheint eine Combobox zu sein (d.h. Texteingabefeld mit Dropdown-Filter)..."
|
||||
|
||||
download_ads:
|
||||
"Fetching published ads...": "Lade veröffentlichte Anzeigen..."
|
||||
"Loaded %s published ads.": "%s veröffentlichte Anzeigen geladen."
|
||||
"Scanning your ad overview...": "Scanne Anzeigenübersicht..."
|
||||
"%s found.": "%s gefunden."
|
||||
"ad": "Anzeige"
|
||||
"Starting download of all ads...": "Starte den Download aller Anzeigen..."
|
||||
"%d of %d ads were downloaded from your profile.": "%d von %d Anzeigen wurden aus Ihrem Profil heruntergeladen."
|
||||
"Starting download of not yet downloaded ads...": "Starte den Download noch nicht heruntergeladener Anzeigen..."
|
||||
"Skipping ad with non-numeric id: %s": "Überspringe Anzeige mit nicht-numerischer ID: %s"
|
||||
"The ad with id %d has already been saved.": "Die Anzeige mit der ID %d wurde bereits gespeichert."
|
||||
"%s were downloaded from your profile.": "%s wurden aus Ihrem Profil heruntergeladen."
|
||||
"new ad": "neue Anzeige"
|
||||
@@ -317,9 +320,6 @@ kleinanzeigen_bot/extract.py:
|
||||
|
||||
_extract_sell_directly_from_ad_page:
|
||||
"Could not extract ad ID from URL: %s": "Konnte Anzeigen-ID nicht aus der URL extrahieren: %s"
|
||||
"Stopping pagination after %s pages to avoid infinite loop": "Stoppe die Seitenaufschaltung nach %s Seiten, um eine Endlosschleife zu vermeiden"
|
||||
"Invalid 'next' page value in paging info: %s, stopping pagination": "Ungültiger 'next'-Seitenwert in Paginierungsinfo: %s, beende Paginierung"
|
||||
"Invalid 'pageNum' in paging info: %s, stopping pagination": "Ungültiger 'pageNum'-Wert in Paginierungsinfo: %s, beende Paginierung"
|
||||
|
||||
#################################################
|
||||
kleinanzeigen_bot/utils/i18n.py:
|
||||
|
||||
Reference in New Issue
Block a user