Refactored category and special attribute (#550)

This commit is contained in:
Heavenfighter
2025-06-12 14:08:06 +02:00
committed by GitHub
parent 86140c77f8
commit 0305a10eae
3 changed files with 68 additions and 11 deletions

View File

@@ -855,8 +855,8 @@ class KleinanzeigenBot(WebScrapingMixin):
"new_with_tag": "Neu mit Etikett",
"new": "Neu",
"like_new": "Sehr Gut",
"alright": "Gut",
"ok": "In Ordnung",
"ok": "Gut",
"alright": "In Ordnung",
"defect": "Defekt",
}
mapped_condition = condition_mapping.get(condition_value)

View File

@@ -277,7 +277,19 @@ class AdExtractor(WebScrapingMixin):
title:str = await self.web_text(By.ID, "viewad-title")
LOG.info('Extracting information from ad with title "%s"', title)
# contains info about ad in different dimensions in form of a key:value dict
# dimension108 contains special attributes
# dimension92 contains fake category, which becomes a special attribute on ad page
belen_conf = await self.web_execute("window.BelenConf")
info["category"] = await self._extract_category_from_ad_page()
# append subcategory and change e.g. category "161/172" to "161/172/lautsprecher_kopfhoerer"
# take subcategory from dimension92 as key 'art_s' sometimes is a special attribute (e.g. gender for clothes)
# the subcategory isn't really necessary, but when set, the appropriate special attribute gets preselected
if belen_conf["universalAnalyticsOpts"]["dimensions"]["dimension92"]:
info["category"] += f"/{belen_conf['universalAnalyticsOpts']['dimensions']['dimension92']}"
info["title"] = title
# Get raw description text
@@ -296,11 +308,8 @@ class AdExtractor(WebScrapingMixin):
info["description"] = description_text.strip()
info["special_attributes"] = await self._extract_special_attributes_from_ad_page()
if "art_s" in info["special_attributes"]:
# change e.g. category "161/172" to "161/172/lautsprecher_kopfhoerer"
info["category"] = f"{info['category']}/{info['special_attributes']['art_s']}"
del info["special_attributes"]["art_s"]
info["special_attributes"] = await self._extract_special_attributes_from_ad_page(belen_conf)
if "schaden_s" in info["special_attributes"]:
# change f to 'nein' and 't' to 'ja'
info["special_attributes"]["schaden_s"] = info["special_attributes"]["schaden_s"].translate(str.maketrans({"t": "ja", "f": "nein"}))
@@ -347,18 +356,16 @@ class AdExtractor(WebScrapingMixin):
return category
async def _extract_special_attributes_from_ad_page(self, belen_conf:dict[str, Any]) -> dict[str, Any]:
    """
    Extracts the special attributes from an ad page.
    If no items are available then special_attributes is empty

    :param belen_conf: the page's ``window.BelenConf`` object; the attributes are read from
        ``belen_conf["universalAnalyticsOpts"]["dimensions"]["dimension108"]``
    :return: a dictionary (possibly empty) where the keys are the attribute names, mapped to their values
    :raises KeyError: if one of the nested keys leading to ``dimension108`` is missing
    """
    # e.g. "art_s:lautsprecher_kopfhoerer|condition_s:like_new|versand_s:t"
    special_attributes_str = belen_conf["universalAnalyticsOpts"]["dimensions"]["dimension108"]

    special_attributes:dict[str, Any] = {}
    for item in special_attributes_str.split("|"):
        # split on the FIRST colon only, so a value that itself contains ":" is kept whole
        # instead of making the dict construction fail with a ValueError
        key, separator, value = item.partition(":")
        if not separator:
            # entries without a colon are not key:value pairs and are ignored
            continue
        # versand_s is never reported, neither bare nor with a dotted prefix
        if key == "versand_s" or key.endswith(".versand_s"):
            continue
        special_attributes[key] = value
    return special_attributes

View File

@@ -433,6 +433,14 @@ class TestAdExtractorContent:
raw_description, # Raw description (without affixes)
"03.02.2025" # Creation date
]),
web_execute=AsyncMock(return_value={
"universalAnalyticsOpts": {
"dimensions": {
"dimension92": "",
"dimension108": ""
}
}
}),
_extract_category_from_ad_page = AsyncMock(return_value = "160"),
_extract_special_attributes_from_ad_page = AsyncMock(return_value = {}),
_extract_pricing_info_from_ad_page = AsyncMock(return_value = (None, "NOT_APPLICABLE")),
@@ -461,6 +469,14 @@ class TestAdExtractorContent:
TimeoutError("Timeout"), # Description times out
"03.02.2025" # Date succeeds
]),
web_execute=AsyncMock(return_value={
"universalAnalyticsOpts": {
"dimensions": {
"dimension92": "",
"dimension108": ""
}
}
}),
_extract_category_from_ad_page = AsyncMock(return_value = "160"),
_extract_special_attributes_from_ad_page = AsyncMock(return_value = {}),
_extract_pricing_info_from_ad_page = AsyncMock(return_value = (None, "NOT_APPLICABLE")),
@@ -494,6 +510,14 @@ class TestAdExtractorContent:
raw_description, # Description without affixes
"03.02.2025" # Creation date
]),
web_execute = AsyncMock(return_value = {
"universalAnalyticsOpts": {
"dimensions": {
"dimension92": "",
"dimension108": ""
}
}
}),
_extract_category_from_ad_page = AsyncMock(return_value = "160"),
_extract_special_attributes_from_ad_page = AsyncMock(return_value = {}),
_extract_pricing_info_from_ad_page = AsyncMock(return_value = (None, "NOT_APPLICABLE")),
@@ -575,9 +599,35 @@ class TestAdExtractorCategory:
}
}
}
result = await extractor._extract_special_attributes_from_ad_page()
result = await extractor._extract_special_attributes_from_ad_page(mock_web_execute.return_value)
assert result == {}
@pytest.mark.asyncio
# pylint: disable=protected-access
async def test_extract_special_attributes_not_empty(self, extractor: AdExtractor) -> None:
    """Check that a populated dimension108 string is parsed into its attributes, minus versand_s."""
    raw_attributes = "versand_s:t|color_s:creme|groesse_s:68|condition_s:alright|type_s:accessoires|art_s:maedchen"
    belen_conf = {
        "universalAnalyticsOpts": {
            "dimensions": {
                "dimension108": raw_attributes
            }
        }
    }

    parsed = await extractor._extract_special_attributes_from_ad_page(belen_conf)

    # every entry of the raw string except versand_s must come through with its value unchanged
    expected = {
        "color_s": "creme",
        "groesse_s": "68",
        "condition_s": "alright",
        "type_s": "accessoires",
        "art_s": "maedchen",
    }
    assert len(parsed) == len(expected)
    assert "versand_s" not in parsed
    for attribute_name, attribute_value in expected.items():
        assert attribute_name in parsed
        assert parsed[attribute_name] == attribute_value
class TestAdExtractorContact:
"""Tests for contact information extraction."""