162 lines
4.6 KiB
Python
162 lines
4.6 KiB
Python
from abc import ABC, abstractmethod
|
|
from typing import Callable, override
|
|
from bs4 import BeautifulSoup
|
|
from requests import get
|
|
from re import findall, match, search
|
|
|
|
def noReturnLambda(x: str) -> str:
    """Return *x* unchanged.

    Identity function used as the default ``priceParser`` / ``imageParser``
    of ``GenericScraper`` when a site needs no text pre-processing.
    """
    return x
|
|
|
|
|
|
class ScrapeError(Exception):
    """Raised when a page cannot be fetched or product data cannot be extracted from it."""
|
|
|
|
|
|
class ScraperResult:
|
|
def __init__(self, name: str, price: float, image: str):
|
|
self.name = name
|
|
self.price = price
|
|
self.image = image
|
|
|
|
@override
|
|
def __repr__(self) -> str:
|
|
return (
|
|
f"<ScraperResult name:{self.name} price:{ self.price } image:{self.image}>"
|
|
)
|
|
|
|
name: str
|
|
price: float
|
|
image: str
|
|
|
|
|
|
class ScraperLike(ABC):
    """Abstract interface that every site scraper implements.

    Attributes:
        name: Label identifying the scraper.
        urlRegex: Regular expression matched against a URL to decide whether
            this scraper handles it.
    """

    name: str
    urlRegex: str

    @abstractmethod
    def scrape(self, url: str) -> ScraperResult:
        """Scrape *url* and return the product data found on the page."""
        ...
|
|
|
|
|
|
class GenericScraper(ScraperLike):
    """Scraper configured with CSS selectors for name, price and image.

    The page is fetched over HTTP, parsed with BeautifulSoup's ``html.parser``
    and queried with the three selectors given at construction time.

    Attributes:
        name: Label identifying the scraper.
        urlRegex: Regular expression for the URLs this scraper handles.
        priceParser: Hook applied to the raw price text before the number is
            extracted from it.
        imageParser: Hook applied to the image element's ``src`` value.
        timeout: Seconds to wait for the HTTP request before giving up.
    """

    name: str
    urlRegex: str
    _nameQuery: str
    _priceQuery: str
    _imageQuery: str
    priceParser: Callable[[str], str]
    imageParser: Callable[[str], str]
    # Class-level default so the attribute always resolves, even on instances
    # whose __init__ was bypassed or overridden.
    timeout: float = 10.0

    def __init__(
        self,
        name: str,
        baseUrl: str,
        nameQuery: str,
        priceQuery: str,
        imageQuery: str,
        priceParser: Callable[[str], str] = noReturnLambda,
        imageParser: Callable[[str], str] = noReturnLambda,
        timeout: float = 10.0,
    ):
        """Configure the scraper.

        Args:
            name: Label identifying the scraper.
            baseUrl: Regular expression stored as ``urlRegex``.
            nameQuery: CSS selector for the element holding the product name.
            priceQuery: CSS selector for the element holding the price text.
            imageQuery: CSS selector for the element whose ``src`` is the image.
            priceParser: Pre-processing applied to the raw price text.
            imageParser: Post-processing applied to the image ``src``.
            timeout: Seconds to wait for the HTTP request.
        """
        self.name = name
        self.urlRegex = baseUrl
        self._nameQuery = nameQuery
        self._priceQuery = priceQuery
        self._imageQuery = imageQuery
        self.priceParser = priceParser
        self.imageParser = imageParser
        self.timeout = timeout

    @staticmethod
    def _parsePrice(text: str) -> float:
        """Extract the first number in *text* and return it as a float.

        Both ``.`` and ``,`` are accepted as separators:

        * one separator is read as the decimal mark (``"12,50"`` -> 12.5);
        * with several separators the last one is the decimal mark when it
          differs from the one before it or is not followed by exactly three
          digits (``"1.299,00"`` -> 1299.0, ``"1,299.99"`` -> 1299.99);
          otherwise every separator is a thousands separator
          (``"1.299.000"`` -> 1299000.0).

        Raises:
            ValueError: If *text* contains no digits.
        """
        found = search(r"[0-9]+(?:[.,][0-9]+)*", text)
        if found is None:
            raise ValueError(f"no number found in {text!r}")

        token = found.group(0)
        groups = token.replace(",", ".").split(".")
        if len(groups) == 1:
            return float(groups[0])

        separators = [ch for ch in token if ch in ".,"]
        lastIsDecimal = (
            len(groups) == 2
            or separators[-1] != separators[-2]
            or len(groups[-1]) != 3
        )
        if lastIsDecimal:
            return float("".join(groups[:-1]) + "." + groups[-1])
        return float("".join(groups))

    @override
    def scrape(self, url: str) -> ScraperResult:
        """Fetch *url* and extract the product's name, price and image.

        Raises:
            ScrapeError: If the response status is not 200, a selector matches
                nothing, the price text holds no number, or the image element
                has no string ``src``.

        Errors raised by the HTTP request itself (including a timeout) are not
        converted and propagate to the caller.
        """
        res = get(
            url,
            headers={
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:138.0) Gecko/20100101 Firefox/138.0"
            },
            # Without a timeout a stalled server would block the caller forever.
            timeout=self.timeout,
        )
        if res.status_code != 200:
            raise ScrapeError("Failed to fetch page.")

        soup = BeautifulSoup(res.text, features="html.parser")
        nameTag = soup.select_one(self._nameQuery)
        priceTag = soup.select_one(self._priceQuery)
        imageTag = soup.select_one(self._imageQuery)

        if nameTag is None or priceTag is None or imageTag is None:
            raise ScrapeError(
                f"Failed to scrape site. Invalid webpage or queries: N:{nameTag},P:{priceTag},I:{imageTag}"
            )

        try:
            # The hook runs inside the try so a ValueError it raises is
            # reported as a price-parsing failure as well.
            price = self._parsePrice(self.priceParser(priceTag.text))
        except ValueError as err:
            # Chain the cause so the offending text shows up in the traceback.
            raise ScrapeError("Failed to scrape site. Error while parsing price.") from err

        # Only a plain string ``src`` is accepted as the image location.
        image = imageTag.get("src")
        if not isinstance(image, str):
            raise ScrapeError("Failed to scrape site. Error while parsing image.")

        return ScraperResult(nameTag.text.strip(), price, self.imageParser(image))
|
|
|
|
|
|
def scrapeSite(url: str) -> ScraperResult | None:
    """Scrape *url* with the first registered scraper whose ``urlRegex`` matches.

    Scrapers are tried in the order they appear in ``scrapers``. The search
    stops at the first match so the page is fetched only once.

    Returns:
        The scraped result, or ``None`` when no scraper handles *url*.

    Any exception raised by the selected scraper's ``scrape`` propagates.
    """
    for scraper in scrapers:
        if match(scraper.urlRegex, url) is not None:
            return scraper.scrape(url)
    return None
|
|
|
|
|
|
# Registry of supported shops, consulted in order by scrapeSite().
# Positional arguments: name, URL regex, name selector, price selector,
# image selector. Literal dots in host names are escaped so that "." cannot
# match an arbitrary character.
scrapers = [
    GenericScraper(
        "Amazon",
        r"^https?:\/\/(www\.)?((amazon)|(amzn))\.\w*",
        "#productTitle",
        "#corePrice_feature_div > div:nth-child(1) > div:nth-child(1) > span:nth-child(1) > span:nth-child(1)",
        "#landingImage",
    ),
    GenericScraper(
        "Bol.com",
        r"^https?:\/\/(www\.)?bol\.com",
        ".page-heading > span:nth-child(1)",
        ".promo-price",
        "div.container-item:nth-child(2) > wsp-selected-item-image-zoom-modal-application:nth-child(1) > button:nth-child(2) > img:nth-child(1)",
        # Turns the newline-plus-space sequence in the price text into a "."
        # (presumably the gap between the whole and fractional parts).
        priceParser=lambda x: x.replace("\n ", "."),
    ),
    GenericScraper(
        "MediaMarkt",
        r"^https?:\/\/(www\.)?mediamarkt\.\w*",
        "h1.sc-d571b66f-0",
        ".sc-6db49389-0 > span:nth-child(2)",
        "div.sc-hLBbgP:nth-child(2) > div:nth-child(3) > ul:nth-child(1) > li:nth-child(1) > div:nth-child(1) > div:nth-child(1) > button:nth-child(1) > img:nth-child(1)",
        # Drops the euro sign before the number is extracted.
        priceParser=lambda x: x.replace("€", ""),
    ),
    GenericScraper(
        "Coolblue",
        r"^https?:\/\/(www\.)?coolblue\.\w*",
        ".css-1o2kclk",
        ".css-puih25 > span:nth-child(1)",
        ".css-ptvba5",
    ),
    GenericScraper(
        "Megekko",
        r"^https?:\/\/(www\.)?megekko\.nl",
        "#prd_title",
        "a.prsPrice:nth-child(1) > div:nth-child(1)",
        "#prd_afbeeldingen > div:nth-child(1) > img:nth-child(1)",
        # Prefixes the scraped src with the site origin.
        imageParser=lambda x: f"https://www.megekko.nl/{x}",
    ),
]
|