"""Simple price scrapers for several web shops.

Each scraper fetches a product page over HTTP, extracts the product name,
price and image URL with CSS selectors, and returns a ScraperResult.
"""

from abc import ABC, abstractmethod
from typing import Callable, override
from bs4 import BeautifulSoup
from requests import get
from re import findall, match, search

# Identity function used as the default post-processor for price/image text.
noReturnLambda: Callable[[str], str] = lambda x: x


class ScrapeError(Exception):
    """Raised when a page cannot be fetched or its content cannot be parsed."""

    pass


class ScraperResult:
    """Container for the data extracted from a single product page."""

    name: str
    price: float
    image: str

    def __init__(self, name: str, price: float, image: str):
        self.name = name
        self.price = price
        self.image = image

    @override
    def __repr__(self) -> str:
        # FIX: the original returned an empty f-string (f""), which made
        # results impossible to inspect while debugging or logging.
        return (
            f"ScraperResult(name={self.name!r}, "
            f"price={self.price!r}, image={self.image!r})"
        )


class ScraperLike(ABC):
    """Interface every scraper implementation must satisfy."""

    name: str
    # Regex matched (via re.match) against a URL to decide whether this
    # scraper handles it.
    urlRegex: str

    @abstractmethod
    def scrape(self, url: str) -> ScraperResult:
        """Fetch *url* and return the extracted product data."""
        pass


class GenericScraper(ScraperLike):
    """CSS-selector driven scraper, configurable per shop."""

    name: str
    urlRegex: str
    _nameQuery: str
    _priceQuery: str
    _imageQuery: str
    # Optional hooks to normalize the raw price text / image URL per shop.
    priceParser: Callable[[str], str]
    imageParser: Callable[[str], str]

    def __init__(
        self,
        name: str,
        baseUrl: str,
        nameQuery: str,
        priceQuery: str,
        imageQuery: str,
        priceParser: Callable[[str], str] = noReturnLambda,
        imageParser: Callable[[str], str] = noReturnLambda,
    ):
        """Configure a scraper.

        baseUrl is a regex that identifies URLs this scraper handles; the
        three *Query arguments are CSS selectors for name, price and image.
        """
        self.name = name
        self.urlRegex = baseUrl
        self._nameQuery = nameQuery
        self._priceQuery = priceQuery
        self._imageQuery = imageQuery
        self.priceParser = priceParser
        self.imageParser = imageParser

    @override
    def scrape(self, url: str) -> ScraperResult:
        """Fetch *url* and extract name, price and image.

        Raises ScrapeError when the page cannot be fetched, when the name or
        image selector matches nothing, or when the price cannot be parsed.
        A missing price element is tolerated and yields a price of 0.0.
        """
        # A browser-like User-Agent avoids trivial bot blocking.
        res = get(
            url,
            headers={
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:138.0) Gecko/20100101 Firefox/138.0"
            },
        )
        if res.status_code != 200:
            raise ScrapeError("Failed to fetch page.")
        soup = BeautifulSoup(res.text, features="html.parser")
        name = soup.select_one(self._nameQuery)
        price = soup.select_one(self._priceQuery)
        image = soup.select_one(self._imageQuery)
        # Name and image are mandatory; price may legitimately be absent.
        if name is None or image is None:
            raise ScrapeError(
                f"Failed to scrape site. Invalid webpage or queries: N:{name},P:{price},I:{image}"
            )
        name = name.text.strip()
        image = image.get("src")
        try:
            if price is None:
                price = "0"
            else:
                price = price.text
            x = self.priceParser(price)
            # Accept "123", "123.45" and "123,45"; keep only the first
            # number found and normalize the decimal separator to ".".
            reg = search(r"([0-9]+)(?:(?:\.|,)([0-9]+))?", x)
            if not reg:
                raise ValueError
            x = reg.group(1)
            g2 = reg.group(2)
            if g2:
                x += "." + g2
            price = float(x)
        except ValueError:
            # Diagnostic aid: show the raw text that failed to parse.
            print(price)
            raise ScrapeError("Failed to scrape site. Error while parsing price.")
        if not isinstance(image, str):
            raise ScrapeError("Failed to scrape site. Error while parsing image.")
        return ScraperResult(name, price, self.imageParser(image))


def scrapeSite(url: str) -> ScraperResult | None:
    """Scrape *url* with the first matching scraper, or return None.

    NOTE(review): when several scrapers' regexes match, the LAST one in the
    list wins (the loop does not break) — presumably intentional; confirm.
    """
    scraped: ScraperResult | None = None
    for i in scrapers:
        if match(i.urlRegex, url) is None:
            continue
        scraped = i.scrape(url)
    return scraped


# Registry of supported shops. Selectors and URL regexes are site-specific
# and brittle by nature; update them when a shop changes its markup.
scrapers = [
    GenericScraper(
        "Amazon",
        r"^https?:\/\/(www\.)?((amazon)|(amzn))\.\w*",
        "#productTitle",
        "#corePrice_feature_div > div:nth-child(1) > div:nth-child(1) > span:nth-child(1) > span:nth-child(1)",
        "#landingImage",
    ),
    GenericScraper(
        "Bol.com",
        r"^https?:\/\/(www\.)?bol.com",
        ".page-heading > span:nth-child(1)",
        ".promo-price",
        "div.container-item:nth-child(2) > wsp-selected-item-image-zoom-modal-application:nth-child(1) > button:nth-child(2) > img:nth-child(1)",
        # Bol renders the decimal part on its own line; turn the break into
        # a decimal separator.
        priceParser=lambda x: x.replace("\n  ", "."),
    ),
    GenericScraper(
        "MediaMarkt",
        r"^https?:\/\/(www\.)?mediamarkt.\w*",
        "h1.sc-d571b66f-0",
        ".sc-6db49389-0 > span:nth-child(2)",
        "div.sc-hLBbgP:nth-child(2) > div:nth-child(3) > ul:nth-child(1) > li:nth-child(1) > div:nth-child(1) > div:nth-child(1) > button:nth-child(1) > img:nth-child(1)",
        priceParser=lambda x: x.replace("€", ""),
    ),
    GenericScraper(
        "Coolblue",
        r"^https?:\/\/(www\.)?coolblue.\w*",
        ".css-1o2kclk",
        ".css-puih25 > span:nth-child(1)",
        ".css-ptvba5",
    ),
    GenericScraper(
        "Megekko",
        r"^https?:\/\/(www\.)?megekko.nl",
        "#prd_title",
        "a.prsPrice:nth-child(1) > div:nth-child(1)",
        "#prd_afbeeldingen > div:nth-child(1) > img:nth-child(1)",
        # Megekko serves relative image paths; prepend the site root.
        imageParser=lambda x: f"https://www.megekko.nl/{x}",
    ),
]