# flask_eindproject/app/scrapers.py

from abc import ABC, abstractmethod
from typing import Callable, override
from bs4 import BeautifulSoup
from requests import get
from re import findall, match, search

def noReturnLambda(x: str) -> str:
    """Return ``x`` unchanged; used as the default price/image parser."""
    return x


class ScrapeError(Exception):
    """Raised when a product page cannot be fetched or parsed."""


class ScraperResult:
    def __init__(self, name: str, price: float, image: str):
        self.name = name
        self.price = price
        self.image = image

    @override
    def __repr__(self) -> str:
        return (
            f"<ScraperResult name:{self.name} price:{ self.price } image:{self.image}>"
        )

    name: str
    price: float
    image: str


class ScraperLike(ABC):
    """Interface that every site scraper has to implement."""

    # Human-readable name of the site this scraper handles.
    name: str
    # Regular expression matched against a URL to decide whether this
    # scraper is responsible for it.
    urlRegex: str

    @abstractmethod
    def scrape(self, url: str) -> ScraperResult:
        """Fetch ``url`` and return the product data found on the page."""
        ...


class GenericScraper(ScraperLike):
    """Scraper for one web shop, driven by three CSS selectors.

    The page is downloaded with ``requests.get`` and parsed with
    BeautifulSoup; the name, price and image elements are then located with
    the selector strings given to the constructor.
    """

    name: str
    urlRegex: str
    _nameQuery: str
    _priceQuery: str
    _imageQuery: str
    priceParser: Callable[[str], str]
    imageParser: Callable[[str], str]

    # Seconds to wait for the shop to respond. Without a timeout
    # ``requests.get`` may block indefinitely on an unresponsive server.
    _requestTimeout: float = 10.0

    def __init__(
        self,
        name: str,
        baseUrl: str,
        nameQuery: str,
        priceQuery: str,
        imageQuery: str,
        priceParser: Callable[[str], str] = noReturnLambda,
        imageParser: Callable[[str], str] = noReturnLambda,
    ):
        """Configure the scraper for one site.

        Args:
            name: Human-readable name of the site.
            baseUrl: Regular expression identifying URLs of this site; it is
                stored as ``urlRegex``.
            nameQuery: CSS selector of the element holding the product name.
            priceQuery: CSS selector of the element holding the price text.
            imageQuery: CSS selector of the element whose ``src`` attribute
                is the product image.
            priceParser: Hook applied to the raw price text before the number
                is extracted. Defaults to the identity function.
            imageParser: Hook applied to the image ``src`` value before it is
                stored in the result. Defaults to the identity function.
        """
        self.name = name
        self.urlRegex = baseUrl
        self._nameQuery = nameQuery
        self._priceQuery = priceQuery
        self._imageQuery = imageQuery
        self.priceParser = priceParser
        self.imageParser = imageParser

    @staticmethod
    def _parsePrice(text: str) -> float:
        """Convert the first number found in ``text`` into a float.

        Both ``.`` and ``,`` are accepted as separators. A number written
        with thousands grouping ("1.299,00", "1,299.99", "1.299") is read as
        a whole amount plus an optional one- or two-digit fraction; any other
        number is read as ``<digits>[separator<digits>]``.

        Raises:
            ValueError: If ``text`` contains no digits.
        """
        first = search(r"([0-9]+)(?:[.,]([0-9]+))?(?:[.,][0-9]+)*", text)
        if first is None:
            raise ValueError(f"no number found in {text!r}")
        whole, fraction = first.group(1), first.group(2)

        # Thousands grouping: a leading group of 1-3 digits (not starting
        # with 0), one or more groups of exactly three digits, and optionally
        # a one- or two-digit fraction. Without this check "1.299,00" would
        # be read as 1.299 instead of 1299.0.
        grouped = match(
            r"([1-9][0-9]{0,2}(?:[.,][0-9]{3})+)(?:[.,]([0-9]{1,2}))?$",
            first.group(0),
        )
        if grouped is not None:
            whole = grouped.group(1).replace(".", "").replace(",", "")
            fraction = grouped.group(2)

        return float(f"{whole}.{fraction}" if fraction else whole)

    @override
    def scrape(self, url: str) -> ScraperResult:
        """Download ``url`` and extract the product name, price and image.

        Args:
            url: Address of the product page.

        Returns:
            A ``ScraperResult`` with the stripped name text, the parsed price
            and the image ``src`` after ``imageParser`` has been applied.

        Raises:
            ScrapeError: If the response status is not 200, one of the
                selectors matches nothing, the price text contains no
                number, or the image element has no string ``src``.

        Exceptions raised by ``requests.get`` itself (including timeouts) are
        not caught here.
        """
        res = get(
            url,
            headers={
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:138.0) Gecko/20100101 Firefox/138.0"
            },
            timeout=self._requestTimeout,
        )
        if res.status_code != 200:
            raise ScrapeError("Failed to fetch page.")

        soup = BeautifulSoup(res.text, features="html.parser")
        nameTag = soup.select_one(self._nameQuery)
        priceTag = soup.select_one(self._priceQuery)
        imageTag = soup.select_one(self._imageQuery)

        if nameTag is None or priceTag is None or imageTag is None:
            raise ScrapeError(
                f"Failed to scrape site. Invalid webpage or queries: N:{nameTag},P:{priceTag},I:{imageTag}"
            )

        try:
            # The site-specific hook runs first so it can normalise the text
            # before the number is extracted.
            price = self._parsePrice(self.priceParser(priceTag.text))
        except ValueError as err:
            raise ScrapeError(
                "Failed to scrape site. Error while parsing price."
            ) from err

        # ``get`` may return None or a list for a missing/multi-valued
        # attribute; only a plain string is usable as an image location.
        image = imageTag.get("src")
        if not isinstance(image, str):
            raise ScrapeError("Failed to scrape site. Error while parsing image.")

        return ScraperResult(nameTag.text.strip(), price, self.imageParser(image))


def scrapeSite(url: str) -> ScraperResult | None:
    """Scrape ``url`` with the first registered scraper whose pattern matches.

    Args:
        url: Address of the product page to scrape.

    Returns:
        The result produced by the matching scraper, or ``None`` when no
        entry in ``scrapers`` has a ``urlRegex`` matching the start of
        ``url``.

    Raises:
        Whatever the selected scraper's ``scrape`` method raises.
    """
    for scraper in scrapers:
        if match(scraper.urlRegex, url) is not None:
            # Stop at the first match: continuing the loop would issue
            # another HTTP request for every further matching pattern and
            # throw away the result already obtained.
            return scraper.scrape(url)

    return None


# Registry of supported shops, consulted in order by ``scrapeSite``.
# Each entry gives: site name, URL pattern, then the CSS selectors for the
# product name, price and image, plus optional text hooks.
# Literal dots in host names are escaped so that "." does not match an
# arbitrary character (e.g. "bolXcom").
# NOTE(review): several selectors use what look like generated class names
# (".sc-...", ".css-..."); presumably these change when the shops redeploy —
# verify against the live pages.
scrapers = [
    GenericScraper(
        "Amazon",
        r"^https?:\/\/(www\.)?((amazon)|(amzn))\.\w*",
        "#productTitle",
        "#corePrice_feature_div > div:nth-child(1) > div:nth-child(1) > span:nth-child(1) > span:nth-child(1)",
        "#landingImage",
    ),
    GenericScraper(
        "Bol.com",
        r"^https?:\/\/(www\.)?bol\.com",
        ".page-heading > span:nth-child(1)",
        ".promo-price",
        "div.container-item:nth-child(2) > wsp-selected-item-image-zoom-modal-application:nth-child(1) > button:nth-child(2) > img:nth-child(1)",
        # Turn the newline-plus-two-spaces run inside the price text into a
        # decimal point before the number is extracted.
        priceParser=lambda x: x.replace("\n  ", "."),
    ),
    GenericScraper(
        "MediaMarkt",
        r"^https?:\/\/(www\.)?mediamarkt\.\w*",
        "h1.sc-d571b66f-0",
        ".sc-6db49389-0 > span:nth-child(2)",
        "div.sc-hLBbgP:nth-child(2) > div:nth-child(3) > ul:nth-child(1) > li:nth-child(1) > div:nth-child(1) > div:nth-child(1) > button:nth-child(1) > img:nth-child(1)",
        # Drop the euro sign from the price text.
        priceParser=lambda x: x.replace("€", ""),
    ),
    GenericScraper(
        "Coolblue",
        r"^https?:\/\/(www\.)?coolblue\.\w*",
        ".css-1o2kclk",
        ".css-puih25 > span:nth-child(1)",
        ".css-ptvba5",
    ),
    GenericScraper(
        "Megekko",
        r"^https?:\/\/(www\.)?megekko\.nl",
        "#prd_title",
        "a.prsPrice:nth-child(1) > div:nth-child(1)",
        "#prd_afbeeldingen > div:nth-child(1) > img:nth-child(1)",
        # Prefix the scraped ``src`` value with the site origin.
        imageParser=lambda x: f"https://www.megekko.nl/{x}",
    ),
]