Updated forms. Added scrapers and updated the edit page to use them. Made the new wishlist look better and added a warning to save UUIDs somewhere safe. Updated the Nix flake.
parent 1f269afce5
commit 13d63245ed
14 changed files with 367 additions and 109 deletions
app/scrapers.py (new file, 162 lines)
@@ -0,0 +1,162 @@
from abc import ABC, abstractmethod
from re import match, search
from typing import Callable, override

from bs4 import BeautifulSoup
from requests import get

# Identity function used as the default price/image post-processor.
noReturnLambda: Callable[[str], str] = lambda x: x


class ScrapeError(Exception):
    pass


class ScraperResult:
    name: str
    price: float
    image: str

    def __init__(self, name: str, price: float, image: str):
        self.name = name
        self.price = price
        self.image = image

    @override
    def __repr__(self) -> str:
        return f"<ScraperResult name:{self.name} price:{self.price} image:{self.image}>"


class ScraperLike(ABC):
    name: str
    urlRegex: str

    @abstractmethod
    def scrape(self, url: str) -> ScraperResult:
        pass


class GenericScraper(ScraperLike):
    """Scraper driven by CSS selectors for the product's name, price and image."""

    _nameQuery: str
    _priceQuery: str
    _imageQuery: str
    priceParser: Callable[[str], str]
    imageParser: Callable[[str], str]

    def __init__(
        self,
        name: str,
        baseUrl: str,
        nameQuery: str,
        priceQuery: str,
        imageQuery: str,
        priceParser: Callable[[str], str] = noReturnLambda,
        imageParser: Callable[[str], str] = noReturnLambda,
    ):
        self.name = name
        self.urlRegex = baseUrl
        self._nameQuery = nameQuery
        self._priceQuery = priceQuery
        self._imageQuery = imageQuery
        self.priceParser = priceParser
        self.imageParser = imageParser

    @override
    def scrape(self, url: str) -> ScraperResult:
        # Send a browser-like User-Agent; most shops block obvious bot clients.
        res = get(
            url,
            headers={
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:138.0) Gecko/20100101 Firefox/138.0"
            },
        )
        if res.status_code != 200:
            raise ScrapeError("Failed to fetch page.")

        soup = BeautifulSoup(res.text, features="html.parser")
        name = soup.select_one(self._nameQuery)
        price = soup.select_one(self._priceQuery)
        image = soup.select_one(self._imageQuery)

        if name is None or price is None or image is None:
            raise ScrapeError(
                f"Failed to scrape site. Invalid webpage or queries: N:{name},P:{price},I:{image}"
            )

        name = name.text.strip()
        image = image.get("src")
        try:
            # Normalise the price text, then extract "<integer>[.,<fraction>]".
            x = self.priceParser(price.text)
            reg = search(r"([0-9]+)(?:[.,]([0-9]+))?", x)
            if not reg:
                raise ValueError

            x = reg.group(1)
            g2 = reg.group(2)
            if g2:
                x += "." + g2

            price = float(x)
        except ValueError:
            print(price)
            raise ScrapeError("Failed to scrape site. Error while parsing price.")
        if not isinstance(image, str):
            raise ScrapeError("Failed to scrape site. Error while parsing image.")

        return ScraperResult(name, price, self.imageParser(image))


def scrapeSite(url: str) -> ScraperResult | None:
    """Scrape url with the first scraper whose urlRegex matches, or return None."""
    for i in scrapers:
        if match(i.urlRegex, url) is None:
            continue

        # Return on the first match so other sites aren't fetched needlessly.
        return i.scrape(url)

    return None


# Registry of supported shops; scrapeSite tries these in order.
scrapers = [
    GenericScraper(
        "Amazon",
        r"^https?:\/\/(www\.)?((amazon)|(amzn))\.\w*",
        "#productTitle",
        "#corePrice_feature_div > div:nth-child(1) > div:nth-child(1) > span:nth-child(1) > span:nth-child(1)",
        "#landingImage",
    ),
    GenericScraper(
        "Bol.com",
        r"^https?:\/\/(www\.)?bol\.com",
        ".page-heading > span:nth-child(1)",
        ".promo-price",
        "div.container-item:nth-child(2) > wsp-selected-item-image-zoom-modal-application:nth-child(1) > button:nth-child(2) > img:nth-child(1)",
        # Bol.com puts the fraction on its own line; turn the break into a decimal point.
        priceParser=lambda x: x.replace("\n ", "."),
    ),
    GenericScraper(
        "MediaMarkt",
        r"^https?:\/\/(www\.)?mediamarkt\.\w*",
        "h1.sc-d571b66f-0",
        ".sc-6db49389-0 > span:nth-child(2)",
        "div.sc-hLBbgP:nth-child(2) > div:nth-child(3) > ul:nth-child(1) > li:nth-child(1) > div:nth-child(1) > div:nth-child(1) > button:nth-child(1) > img:nth-child(1)",
        priceParser=lambda x: x.replace("€", ""),
    ),
    GenericScraper(
        "Coolblue",
        r"^https?:\/\/(www\.)?coolblue\.\w*",
        ".css-1o2kclk",
        ".css-puih25 > span:nth-child(1)",
        ".css-ptvba5",
    ),
    GenericScraper(
        "Megekko",
        r"^https?:\/\/(www\.)?megekko\.nl",
        "#prd_title",
        "a.prsPrice:nth-child(1) > div:nth-child(1)",
        "#prd_afbeeldingen > div:nth-child(1) > img:nth-child(1)",
        # Megekko serves relative image paths; make them absolute.
        imageParser=lambda x: f"https://www.megekko.nl/{x}",
    ),
]
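For reference, a minimal sketch of how the rest of the app might call the new module; the product URL below is a placeholder for illustration, not something from this commit. scrapeSite returns a ScraperResult, None when no scraper's urlRegex matches, and raises ScrapeError when fetching or parsing fails.

    from app.scrapers import ScrapeError, scrapeSite

    # Placeholder URL for illustration only.
    url = "https://www.bol.com/nl/nl/p/some-product/"

    try:
        result = scrapeSite(url)
    except ScrapeError as e:
        print(f"Could not scrape {url}: {e}")
    else:
        if result is None:
            print("No scraper registered for this site.")
        else:
            print(result.name, result.price, result.image)

Supporting another shop then only requires appending a GenericScraper with that site's CSS selectors (and, if needed, price/image parsers) to the scrapers list.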