# flask_eindproject/app/scrapers.py
from abc import ABC, abstractmethod
from typing import Callable, override
from bs4 import BeautifulSoup
from requests import get
from re import findall, match, search
def noReturnLambda(x: str) -> str:
    """Identity transform: the default price/image post-processing hook.

    PEP 8 fix: was a lambda assigned to a name; a ``def`` keeps the same
    callable contract but gets a proper name in tracebacks.
    """
    return x
class ScrapeError(Exception):
    """Raised when a product page cannot be fetched or parsed."""
class ScraperResult:
def __init__(self, name: str, price: float, image: str):
self.name = name
self.price = price
self.image = image
@override
def __repr__(self) -> str:
return (
f"<ScraperResult name:{self.name} price:{ self.price } image:{self.image}>"
)
name: str
price: float
image: str
class ScraperLike(ABC):
    """Interface every site scraper implements.

    Attributes:
        name: Human-readable shop name.
        urlRegex: Pattern matched against a URL to select this scraper.
    """

    name: str
    urlRegex: str

    @abstractmethod
    def scrape(self, url: str) -> ScraperResult:
        """Return the product data found at *url*."""
        ...
class GenericScraper(ScraperLike):
    """CSS-selector-driven scraper.

    Fetches the page, extracts name/price/image with ``select_one`` and
    normalizes the price text into a float via a regex after running the
    site-specific ``priceParser`` hook.
    """

    name: str
    urlRegex: str
    _nameQuery: str
    _priceQuery: str
    _imageQuery: str
    priceParser: Callable[[str], str]
    imageParser: Callable[[str], str]

    def __init__(
        self,
        name: str,
        baseUrl: str,
        nameQuery: str,
        priceQuery: str,
        imageQuery: str,
        priceParser: Callable[[str], str] = noReturnLambda,
        imageParser: Callable[[str], str] = noReturnLambda,
    ):
        self.name = name
        self.urlRegex = baseUrl
        self._nameQuery = nameQuery
        self._priceQuery = priceQuery
        self._imageQuery = imageQuery
        self.priceParser = priceParser
        self.imageParser = imageParser

    @override
    def scrape(self, url: str) -> ScraperResult:
        """Fetch *url* and return its scraped product data.

        Raises:
            ScrapeError: when the page cannot be fetched, a CSS query
                matches nothing, or the price/image cannot be parsed.
        """
        res = get(
            url,
            headers={
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:138.0) Gecko/20100101 Firefox/138.0"
            },
            # BUG FIX: requests has no default timeout — without one a
            # stalled server hangs this call (and the request thread) forever.
            timeout=15,
        )
        if res.status_code != 200:
            raise ScrapeError("Failed to fetch page.")
        soup = BeautifulSoup(res.text, features="html.parser")
        name = soup.select_one(self._nameQuery)
        price = soup.select_one(self._priceQuery)
        image = soup.select_one(self._imageQuery)
        if name is None or price is None or image is None:
            raise ScrapeError(
                f"Failed to scrape site. Invalid webpage or queries: N:{name},P:{price},I:{image}"
            )
        product_name = name.text.strip()
        image_src = image.get("src")
        try:
            cleaned = self.priceParser(price.text)
            # Accept "12", "12.99" or "12,99" and normalize to "12.99".
            found = search(r"([0-9]+)(?:(?:\.|,)([0-9]+))?", cleaned)
            if not found:
                raise ValueError(cleaned)
            number = found.group(1)
            decimals = found.group(2)
            if decimals:
                number += "." + decimals
            parsed_price = float(number)
        except ValueError as err:
            # BUG FIX: was a bare debug print(price) plus an unchained raise;
            # chain the cause and surface the offending text in the error.
            raise ScrapeError(
                "Failed to scrape site. Error while parsing price."
            ) from err
        if not isinstance(image_src, str):
            raise ScrapeError("Failed to scrape site. Error while parsing image.")
        return ScraperResult(product_name, parsed_price, self.imageParser(image_src))
def scrapeSite(url: str) -> ScraperResult | None:
    """Scrape *url* with the first registered scraper whose pattern matches.

    Returns None when no scraper recognises the URL.

    BUG FIX: the original kept looping after a match, re-scraping every
    matching site over the network and keeping only the last result;
    returning on the first match preserves the single-result contract
    without redundant fetches.
    """
    for scraper in scrapers:
        if match(scraper.urlRegex, url) is not None:
            return scraper.scrape(url)
    return None
# Registry of supported shops; scrapeSite() matches urlRegex against the
# incoming URL to pick one. BUG FIX: domain dots in the patterns were
# unescaped (e.g. "bol.com" also matched "bolXcom") — now escaped.
scrapers = [
    GenericScraper(
        "Amazon",
        r"^https?:\/\/(www\.)?((amazon)|(amzn))\.\w*",
        "#productTitle",
        "#corePrice_feature_div > div:nth-child(1) > div:nth-child(1) > span:nth-child(1) > span:nth-child(1)",
        "#landingImage",
    ),
    GenericScraper(
        "Bol.com",
        r"^https?:\/\/(www\.)?bol\.com",
        ".page-heading > span:nth-child(1)",
        ".promo-price",
        "div.container-item:nth-child(2) > wsp-selected-item-image-zoom-modal-application:nth-child(1) > button:nth-child(2) > img:nth-child(1)",
        # Euros and cents are rendered on separate lines; join with a dot.
        priceParser=lambda x: x.replace("\n ", "."),
    ),
    GenericScraper(
        "MediaMarkt",
        r"^https?:\/\/(www\.)?mediamarkt\.\w*",
        "h1.sc-d571b66f-0",
        ".sc-6db49389-0 > span:nth-child(2)",
        "div.sc-hLBbgP:nth-child(2) > div:nth-child(3) > ul:nth-child(1) > li:nth-child(1) > div:nth-child(1) > div:nth-child(1) > button:nth-child(1) > img:nth-child(1)",
        # NOTE(review): replace("", "") is a no-op — a non-ASCII character
        # (currency symbol / NBSP?) was probably lost in transit; confirm
        # against the original source before relying on this parser.
        priceParser=lambda x: x.replace("", ""),
    ),
    GenericScraper(
        "Coolblue",
        r"^https?:\/\/(www\.)?coolblue\.\w*",
        ".css-1o2kclk",
        ".css-puih25 > span:nth-child(1)",
        ".css-ptvba5",
    ),
    GenericScraper(
        "Megekko",
        r"^https?:\/\/(www\.)?megekko\.nl",
        "#prd_title",
        "a.prsPrice:nth-child(1) > div:nth-child(1)",
        "#prd_afbeeldingen > div:nth-child(1) > img:nth-child(1)",
        # Megekko serves relative image paths; make them absolute.
        imageParser=lambda x: f"https://www.megekko.nl/{x}",
    ),
]