mirror of
https://github.com/Second-Hand-Friends/kleinanzeigen-bot.git
synced 2026-03-12 02:31:45 +01:00
Refactored category and special attribute (#550)
This commit is contained in:
@@ -855,8 +855,8 @@ class KleinanzeigenBot(WebScrapingMixin):
|
||||
"new_with_tag": "Neu mit Etikett",
|
||||
"new": "Neu",
|
||||
"like_new": "Sehr Gut",
|
||||
"alright": "Gut",
|
||||
"ok": "In Ordnung",
|
||||
"ok": "Gut",
|
||||
"alright": "In Ordnung",
|
||||
"defect": "Defekt",
|
||||
}
|
||||
mapped_condition = condition_mapping.get(condition_value)
|
||||
|
||||
@@ -277,7 +277,19 @@ class AdExtractor(WebScrapingMixin):
|
||||
title:str = await self.web_text(By.ID, "viewad-title")
|
||||
LOG.info('Extracting information from ad with title "%s"', title)
|
||||
|
||||
# window.BelenConf contains info about the ad in different dimensions in the form of a key:value dict
|
||||
# dimension108 contains special attributes
|
||||
# dimension92 contains the fake category, which becomes a special attribute on the ad page
|
||||
belen_conf = await self.web_execute("window.BelenConf")
|
||||
|
||||
info["category"] = await self._extract_category_from_ad_page()
|
||||
|
||||
# append subcategory and change e.g. category "161/172" to "161/172/lautsprecher_kopfhoerer"
|
||||
# take the subcategory from dimension92, because the key 'art_s' is sometimes a special attribute (e.g. gender for clothes)
|
||||
# the subcategory isn't really necessary, but when set, the appropriate special attribute gets preselected
|
||||
if belen_conf["universalAnalyticsOpts"]["dimensions"]["dimension92"]:
|
||||
info["category"] += f"/{belen_conf['universalAnalyticsOpts']['dimensions']['dimension92']}"
|
||||
|
||||
info["title"] = title
|
||||
|
||||
# Get raw description text
|
||||
@@ -296,11 +308,8 @@ class AdExtractor(WebScrapingMixin):
|
||||
|
||||
info["description"] = description_text.strip()
|
||||
|
||||
info["special_attributes"] = await self._extract_special_attributes_from_ad_page()
|
||||
if "art_s" in info["special_attributes"]:
|
||||
# change e.g. category "161/172" to "161/172/lautsprecher_kopfhoerer"
|
||||
info["category"] = f"{info['category']}/{info['special_attributes']['art_s']}"
|
||||
del info["special_attributes"]["art_s"]
|
||||
info["special_attributes"] = await self._extract_special_attributes_from_ad_page(belen_conf)
|
||||
|
||||
if "schaden_s" in info["special_attributes"]:
|
||||
# change 'f' to 'nein' and 't' to 'ja'
|
||||
info["special_attributes"]["schaden_s"] = info["special_attributes"]["schaden_s"].translate(str.maketrans({"t": "ja", "f": "nein"}))
|
||||
@@ -347,18 +356,16 @@ class AdExtractor(WebScrapingMixin):
|
||||
|
||||
return category
|
||||
|
||||
async def _extract_special_attributes_from_ad_page(self) -> dict[str, Any]:
|
||||
async def _extract_special_attributes_from_ad_page(self, belen_conf:dict[str, Any]) -> dict[str, Any]:
|
||||
"""
|
||||
Extracts the special attributes from an ad page.
|
||||
If no items are available then special_attributes is empty
|
||||
|
||||
:return: a dictionary (possibly empty) where the keys are the attribute names, mapped to their values
|
||||
"""
|
||||
belen_conf = await self.web_execute("window.BelenConf")
|
||||
|
||||
# e.g. "art_s:lautsprecher_kopfhoerer|condition_s:like_new|versand_s:t"
|
||||
special_attributes_str = belen_conf["universalAnalyticsOpts"]["dimensions"]["dimension108"]
|
||||
|
||||
special_attributes = dict(item.split(":") for item in special_attributes_str.split("|") if ":" in item)
|
||||
special_attributes = {k: v for k, v in special_attributes.items() if not k.endswith(".versand_s") and k != "versand_s"}
|
||||
return special_attributes
|
||||
|
||||
@@ -433,6 +433,14 @@ class TestAdExtractorContent:
|
||||
raw_description, # Raw description (without affixes)
|
||||
"03.02.2025" # Creation date
|
||||
]),
|
||||
web_execute=AsyncMock(return_value={
|
||||
"universalAnalyticsOpts": {
|
||||
"dimensions": {
|
||||
"dimension92": "",
|
||||
"dimension108": ""
|
||||
}
|
||||
}
|
||||
}),
|
||||
_extract_category_from_ad_page = AsyncMock(return_value = "160"),
|
||||
_extract_special_attributes_from_ad_page = AsyncMock(return_value = {}),
|
||||
_extract_pricing_info_from_ad_page = AsyncMock(return_value = (None, "NOT_APPLICABLE")),
|
||||
@@ -461,6 +469,14 @@ class TestAdExtractorContent:
|
||||
TimeoutError("Timeout"), # Description times out
|
||||
"03.02.2025" # Date succeeds
|
||||
]),
|
||||
web_execute=AsyncMock(return_value={
|
||||
"universalAnalyticsOpts": {
|
||||
"dimensions": {
|
||||
"dimension92": "",
|
||||
"dimension108": ""
|
||||
}
|
||||
}
|
||||
}),
|
||||
_extract_category_from_ad_page = AsyncMock(return_value = "160"),
|
||||
_extract_special_attributes_from_ad_page = AsyncMock(return_value = {}),
|
||||
_extract_pricing_info_from_ad_page = AsyncMock(return_value = (None, "NOT_APPLICABLE")),
|
||||
@@ -494,6 +510,14 @@ class TestAdExtractorContent:
|
||||
raw_description, # Description without affixes
|
||||
"03.02.2025" # Creation date
|
||||
]),
|
||||
web_execute = AsyncMock(return_value = {
|
||||
"universalAnalyticsOpts": {
|
||||
"dimensions": {
|
||||
"dimension92": "",
|
||||
"dimension108": ""
|
||||
}
|
||||
}
|
||||
}),
|
||||
_extract_category_from_ad_page = AsyncMock(return_value = "160"),
|
||||
_extract_special_attributes_from_ad_page = AsyncMock(return_value = {}),
|
||||
_extract_pricing_info_from_ad_page = AsyncMock(return_value = (None, "NOT_APPLICABLE")),
|
||||
@@ -575,9 +599,35 @@ class TestAdExtractorCategory:
|
||||
}
|
||||
}
|
||||
}
|
||||
result = await extractor._extract_special_attributes_from_ad_page()
|
||||
result = await extractor._extract_special_attributes_from_ad_page(mock_web_execute.return_value)
|
||||
assert result == {}
|
||||
|
||||
@pytest.mark.asyncio
|
||||
# pylint: disable=protected-access
|
||||
async def test_extract_special_attributes_not_empty(self, extractor: AdExtractor) -> None:
|
||||
"""Test extraction of special attributes when not empty."""
|
||||
|
||||
special_atts = {
|
||||
"universalAnalyticsOpts": {
|
||||
"dimensions": {
|
||||
"dimension108": "versand_s:t|color_s:creme|groesse_s:68|condition_s:alright|type_s:accessoires|art_s:maedchen"
|
||||
}
|
||||
}
|
||||
}
|
||||
result = await extractor._extract_special_attributes_from_ad_page(special_atts)
|
||||
assert len(result) == 5
|
||||
assert "versand_s" not in result
|
||||
assert "color_s" in result
|
||||
assert result["color_s"] == "creme"
|
||||
assert "groesse_s" in result
|
||||
assert result["groesse_s"] == "68"
|
||||
assert "condition_s" in result
|
||||
assert result["condition_s"] == "alright"
|
||||
assert "type_s" in result
|
||||
assert result["type_s"] == "accessoires"
|
||||
assert "art_s" in result
|
||||
assert result["art_s"] == "maedchen"
|
||||
|
||||
|
||||
class TestAdExtractorContact:
|
||||
"""Tests for contact information extraction."""
|
||||
|
||||
Reference in New Issue
Block a user