mirror of
https://github.com/Second-Hand-Friends/kleinanzeigen-bot.git
synced 2026-03-12 10:31:50 +01:00
feat: add configurable timeouts (#673)
## ℹ️ Description - Related issues: #671, #658 - Introduces configurable timeout controls plus retry/backoff handling for flaky DOM operations. We often see timeouts which are not reproducible in certain configurations. I suspect timeout issues based on a combination of internet speed, browser, OS, age of the computer and the weather. This PR introduces a comprehensive config model to tweak timeouts. ## 📋 Changes Summary - add TimeoutConfig to the main config/schema and expose timeouts in README/docs - wire WebScrapingMixin, extractor, update checker, and browser diagnostics to honor the configurable timeouts and retries - update translations/tests to cover the new behaviour and ensure lint/mypy/pyright pipelines remain green ### ⚙️ Type of Change - [ ] 🐞 Bug fix (non-breaking change which fixes an issue) - [x] ✨ New feature (adds new functionality without breaking existing usage) - [ ] 💥 Breaking change (changes that might break existing user setups, scripts, or configurations) ## ✅ Checklist - [x] I have reviewed my changes to ensure they meet the project's standards. - [x] I have tested my changes and ensured that all tests pass (`pdm run test`). - [x] I have formatted the code (`pdm run format`). - [x] I have verified that linting passes (`pdm run lint`). - [x] I have updated documentation where necessary. <!-- This is an auto-generated comment: release notes by coderabbit.ai --> ## Summary by CodeRabbit * **New Features** * Centralized, configurable timeout system for web interactions, detection flows, publishing, and pagination. * Optional retry with exponential backoff for operations that time out. * **Improvements** * Replaced fixed wait times with dynamic timeouts throughout workflows. * More informative timeout-related messages and diagnostics. * **Tests** * New and expanded test coverage for timeout behavior, pagination, diagnostics, and retry logic. <!-- end of auto-generated comment: release notes by coderabbit.ai -->
This commit is contained in:
@@ -33,7 +33,7 @@ class AdExtractor(WebScrapingMixin):
|
||||
def __init__(self, browser:Browser, config:Config) -> None:
|
||||
super().__init__()
|
||||
self.browser = browser
|
||||
self.config = config
|
||||
self.config:Config = config
|
||||
|
||||
async def download_ad(self, ad_id:int) -> None:
|
||||
"""
|
||||
@@ -146,9 +146,10 @@ class AdExtractor(WebScrapingMixin):
|
||||
|
||||
# --- Pagination handling ---
|
||||
multi_page = False
|
||||
pagination_timeout = self._timeout("pagination_initial")
|
||||
try:
|
||||
# Correct selector: Use uppercase '.Pagination'
|
||||
pagination_section = await self.web_find(By.CSS_SELECTOR, ".Pagination", timeout = 10) # Increased timeout slightly
|
||||
pagination_section = await self.web_find(By.CSS_SELECTOR, ".Pagination", timeout = pagination_timeout) # Increased timeout slightly
|
||||
# Correct selector: Use 'aria-label'
|
||||
# Also check if the button is actually present AND potentially enabled (though enabled check isn't strictly necessary here, only for clicking later)
|
||||
next_buttons = await self.web_find_all(By.CSS_SELECTOR, 'button[aria-label="Nächste"]', parent = pagination_section)
|
||||
@@ -204,9 +205,10 @@ class AdExtractor(WebScrapingMixin):
|
||||
break
|
||||
|
||||
# --- Navigate to next page ---
|
||||
follow_up_timeout = self._timeout("pagination_follow_up")
|
||||
try:
|
||||
# Find the pagination section again (scope might have changed after scroll/wait)
|
||||
pagination_section = await self.web_find(By.CSS_SELECTOR, ".Pagination", timeout = 5)
|
||||
pagination_section = await self.web_find(By.CSS_SELECTOR, ".Pagination", timeout = follow_up_timeout)
|
||||
# Find the "Next" button using the correct aria-label selector and ensure it's not disabled
|
||||
next_button_element = None
|
||||
possible_next_buttons = await self.web_find_all(By.CSS_SELECTOR, 'button[aria-label="Nächste"]', parent = pagination_section)
|
||||
@@ -432,8 +434,19 @@ class AdExtractor(WebScrapingMixin):
|
||||
|
||||
# Fallback to legacy selectors in case the breadcrumb structure is unexpected.
|
||||
LOG.debug(_("Falling back to legacy breadcrumb selectors; collected ids: %s"), category_ids)
|
||||
category_first_part = await self.web_find(By.CSS_SELECTOR, "a:nth-of-type(2)", parent = category_line)
|
||||
category_second_part = await self.web_find(By.CSS_SELECTOR, "a:nth-of-type(3)", parent = category_line)
|
||||
fallback_timeout = self._effective_timeout()
|
||||
try:
|
||||
category_first_part = await self.web_find(By.CSS_SELECTOR, "a:nth-of-type(2)", parent = category_line)
|
||||
category_second_part = await self.web_find(By.CSS_SELECTOR, "a:nth-of-type(3)", parent = category_line)
|
||||
except TimeoutError as exc:
|
||||
LOG.error(
|
||||
"Legacy breadcrumb selectors not found within %.1f seconds (collected ids: %s)",
|
||||
fallback_timeout,
|
||||
category_ids
|
||||
)
|
||||
raise TimeoutError(
|
||||
_("Unable to locate breadcrumb fallback selectors within %(seconds).1f seconds.") % {"seconds": fallback_timeout}
|
||||
) from exc
|
||||
href_first:str = str(category_first_part.attrs["href"])
|
||||
href_second:str = str(category_second_part.attrs["href"])
|
||||
cat_num_first_raw = href_first.rsplit("/", maxsplit = 1)[-1]
|
||||
|
||||
Reference in New Issue
Block a user