mirror of
https://github.com/Second-Hand-Friends/kleinanzeigen-bot.git
synced 2026-03-12 02:31:45 +01:00
Refactored category and special attribute (#550)
This commit is contained in:
@@ -855,8 +855,8 @@ class KleinanzeigenBot(WebScrapingMixin):
|
|||||||
"new_with_tag": "Neu mit Etikett",
|
"new_with_tag": "Neu mit Etikett",
|
||||||
"new": "Neu",
|
"new": "Neu",
|
||||||
"like_new": "Sehr Gut",
|
"like_new": "Sehr Gut",
|
||||||
"alright": "Gut",
|
"ok": "Gut",
|
||||||
"ok": "In Ordnung",
|
"alright": "In Ordnung",
|
||||||
"defect": "Defekt",
|
"defect": "Defekt",
|
||||||
}
|
}
|
||||||
mapped_condition = condition_mapping.get(condition_value)
|
mapped_condition = condition_mapping.get(condition_value)
|
||||||
|
|||||||
@@ -277,7 +277,19 @@ class AdExtractor(WebScrapingMixin):
|
|||||||
title:str = await self.web_text(By.ID, "viewad-title")
|
title:str = await self.web_text(By.ID, "viewad-title")
|
||||||
LOG.info('Extracting information from ad with title "%s"', title)
|
LOG.info('Extracting information from ad with title "%s"', title)
|
||||||
|
|
||||||
|
# contains info about ad in different dimensions in form of a key:value dict
|
||||||
|
# dimension108 contains special attributes
|
||||||
|
# dimension92 contains a fake category, which becomes a special attribute on the ad page
|
||||||
|
belen_conf = await self.web_execute("window.BelenConf")
|
||||||
|
|
||||||
info["category"] = await self._extract_category_from_ad_page()
|
info["category"] = await self._extract_category_from_ad_page()
|
||||||
|
|
||||||
|
# append subcategory and change e.g. category "161/172" to "161/172/lautsprecher_kopfhoerer"
|
||||||
|
# take the subcategory from dimension92, because the key 'art_s' is sometimes a special attribute (e.g. gender for clothes)
|
||||||
|
# the subcategory isn't really necessary, but when set, the appropriate special attribute gets preselected
|
||||||
|
if belen_conf["universalAnalyticsOpts"]["dimensions"]["dimension92"]:
|
||||||
|
info["category"] += f"/{belen_conf['universalAnalyticsOpts']['dimensions']['dimension92']}"
|
||||||
|
|
||||||
info["title"] = title
|
info["title"] = title
|
||||||
|
|
||||||
# Get raw description text
|
# Get raw description text
|
||||||
@@ -296,11 +308,8 @@ class AdExtractor(WebScrapingMixin):
|
|||||||
|
|
||||||
info["description"] = description_text.strip()
|
info["description"] = description_text.strip()
|
||||||
|
|
||||||
info["special_attributes"] = await self._extract_special_attributes_from_ad_page()
|
info["special_attributes"] = await self._extract_special_attributes_from_ad_page(belen_conf)
|
||||||
if "art_s" in info["special_attributes"]:
|
|
||||||
# change e.g. category "161/172" to "161/172/lautsprecher_kopfhoerer"
|
|
||||||
info["category"] = f"{info['category']}/{info['special_attributes']['art_s']}"
|
|
||||||
del info["special_attributes"]["art_s"]
|
|
||||||
if "schaden_s" in info["special_attributes"]:
|
if "schaden_s" in info["special_attributes"]:
|
||||||
# change 'f' to 'nein' and 't' to 'ja'
|
# change 'f' to 'nein' and 't' to 'ja'
|
||||||
info["special_attributes"]["schaden_s"] = info["special_attributes"]["schaden_s"].translate(str.maketrans({"t": "ja", "f": "nein"}))
|
info["special_attributes"]["schaden_s"] = info["special_attributes"]["schaden_s"].translate(str.maketrans({"t": "ja", "f": "nein"}))
|
||||||
@@ -347,18 +356,16 @@ class AdExtractor(WebScrapingMixin):
|
|||||||
|
|
||||||
return category
|
return category
|
||||||
|
|
||||||
async def _extract_special_attributes_from_ad_page(self) -> dict[str, Any]:
|
async def _extract_special_attributes_from_ad_page(self, belen_conf:dict[str, Any]) -> dict[str, Any]:
|
||||||
"""
|
"""
|
||||||
Extracts the special attributes from an ad page.
|
Extracts the special attributes from an ad page.
|
||||||
If no items are available then special_attributes is empty
|
If no items are available then special_attributes is empty
|
||||||
|
|
||||||
:return: a dictionary (possibly empty) where the keys are the attribute names, mapped to their values
|
:return: a dictionary (possibly empty) where the keys are the attribute names, mapped to their values
|
||||||
"""
|
"""
|
||||||
belen_conf = await self.web_execute("window.BelenConf")
|
|
||||||
|
|
||||||
# e.g. "art_s:lautsprecher_kopfhoerer|condition_s:like_new|versand_s:t"
|
# e.g. "art_s:lautsprecher_kopfhoerer|condition_s:like_new|versand_s:t"
|
||||||
special_attributes_str = belen_conf["universalAnalyticsOpts"]["dimensions"]["dimension108"]
|
special_attributes_str = belen_conf["universalAnalyticsOpts"]["dimensions"]["dimension108"]
|
||||||
|
|
||||||
special_attributes = dict(item.split(":") for item in special_attributes_str.split("|") if ":" in item)
|
special_attributes = dict(item.split(":") for item in special_attributes_str.split("|") if ":" in item)
|
||||||
special_attributes = {k: v for k, v in special_attributes.items() if not k.endswith(".versand_s") and k != "versand_s"}
|
special_attributes = {k: v for k, v in special_attributes.items() if not k.endswith(".versand_s") and k != "versand_s"}
|
||||||
return special_attributes
|
return special_attributes
|
||||||
|
|||||||
@@ -433,6 +433,14 @@ class TestAdExtractorContent:
|
|||||||
raw_description, # Raw description (without affixes)
|
raw_description, # Raw description (without affixes)
|
||||||
"03.02.2025" # Creation date
|
"03.02.2025" # Creation date
|
||||||
]),
|
]),
|
||||||
|
web_execute=AsyncMock(return_value={
|
||||||
|
"universalAnalyticsOpts": {
|
||||||
|
"dimensions": {
|
||||||
|
"dimension92": "",
|
||||||
|
"dimension108": ""
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}),
|
||||||
_extract_category_from_ad_page = AsyncMock(return_value = "160"),
|
_extract_category_from_ad_page = AsyncMock(return_value = "160"),
|
||||||
_extract_special_attributes_from_ad_page = AsyncMock(return_value = {}),
|
_extract_special_attributes_from_ad_page = AsyncMock(return_value = {}),
|
||||||
_extract_pricing_info_from_ad_page = AsyncMock(return_value = (None, "NOT_APPLICABLE")),
|
_extract_pricing_info_from_ad_page = AsyncMock(return_value = (None, "NOT_APPLICABLE")),
|
||||||
@@ -461,6 +469,14 @@ class TestAdExtractorContent:
|
|||||||
TimeoutError("Timeout"), # Description times out
|
TimeoutError("Timeout"), # Description times out
|
||||||
"03.02.2025" # Date succeeds
|
"03.02.2025" # Date succeeds
|
||||||
]),
|
]),
|
||||||
|
web_execute=AsyncMock(return_value={
|
||||||
|
"universalAnalyticsOpts": {
|
||||||
|
"dimensions": {
|
||||||
|
"dimension92": "",
|
||||||
|
"dimension108": ""
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}),
|
||||||
_extract_category_from_ad_page = AsyncMock(return_value = "160"),
|
_extract_category_from_ad_page = AsyncMock(return_value = "160"),
|
||||||
_extract_special_attributes_from_ad_page = AsyncMock(return_value = {}),
|
_extract_special_attributes_from_ad_page = AsyncMock(return_value = {}),
|
||||||
_extract_pricing_info_from_ad_page = AsyncMock(return_value = (None, "NOT_APPLICABLE")),
|
_extract_pricing_info_from_ad_page = AsyncMock(return_value = (None, "NOT_APPLICABLE")),
|
||||||
@@ -494,6 +510,14 @@ class TestAdExtractorContent:
|
|||||||
raw_description, # Description without affixes
|
raw_description, # Description without affixes
|
||||||
"03.02.2025" # Creation date
|
"03.02.2025" # Creation date
|
||||||
]),
|
]),
|
||||||
|
web_execute = AsyncMock(return_value = {
|
||||||
|
"universalAnalyticsOpts": {
|
||||||
|
"dimensions": {
|
||||||
|
"dimension92": "",
|
||||||
|
"dimension108": ""
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}),
|
||||||
_extract_category_from_ad_page = AsyncMock(return_value = "160"),
|
_extract_category_from_ad_page = AsyncMock(return_value = "160"),
|
||||||
_extract_special_attributes_from_ad_page = AsyncMock(return_value = {}),
|
_extract_special_attributes_from_ad_page = AsyncMock(return_value = {}),
|
||||||
_extract_pricing_info_from_ad_page = AsyncMock(return_value = (None, "NOT_APPLICABLE")),
|
_extract_pricing_info_from_ad_page = AsyncMock(return_value = (None, "NOT_APPLICABLE")),
|
||||||
@@ -575,9 +599,35 @@ class TestAdExtractorCategory:
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
result = await extractor._extract_special_attributes_from_ad_page()
|
result = await extractor._extract_special_attributes_from_ad_page(mock_web_execute.return_value)
|
||||||
assert result == {}
|
assert result == {}
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
|
||||||
|
# pylint: disable=protected-access
|
||||||
|
async def test_extract_special_attributes_not_empty(self, extractor: AdExtractor) -> None:
|
||||||
|
"""Test extraction of special attributes when not empty."""
|
||||||
|
|
||||||
|
special_atts = {
|
||||||
|
"universalAnalyticsOpts": {
|
||||||
|
"dimensions": {
|
||||||
|
"dimension108": "versand_s:t|color_s:creme|groesse_s:68|condition_s:alright|type_s:accessoires|art_s:maedchen"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
result = await extractor._extract_special_attributes_from_ad_page(special_atts)
|
||||||
|
assert len(result) == 5
|
||||||
|
assert "versand_s" not in result
|
||||||
|
assert "color_s" in result
|
||||||
|
assert result["color_s"] == "creme"
|
||||||
|
assert "groesse_s" in result
|
||||||
|
assert result["groesse_s"] == "68"
|
||||||
|
assert "condition_s" in result
|
||||||
|
assert result["condition_s"] == "alright"
|
||||||
|
assert "type_s" in result
|
||||||
|
assert result["type_s"] == "accessoires"
|
||||||
|
assert "art_s" in result
|
||||||
|
assert result["art_s"] == "maedchen"
|
||||||
|
|
||||||
|
|
||||||
class TestAdExtractorContact:
|
class TestAdExtractorContact:
|
||||||
"""Tests for contact information extraction."""
|
"""Tests for contact information extraction."""
|
||||||
|
|||||||
Reference in New Issue
Block a user