Updated forms. Added scrapers and updated the edit page to use them. Made the new wishlist look better and added a warning to save UUIDs somewhere safe. Updated the Nix flake.
parent 1f269afce5
commit 13d63245ed
14 changed files with 367 additions and 109 deletions
app/scrapers.py (new file, 162 lines)
@@ -0,0 +1,162 @@
from abc import ABC, abstractmethod
from re import match, search
from typing import Callable, override

from bs4 import BeautifulSoup
from requests import get

# Identity function used as the default price/image post-processor.
noReturnLambda: Callable[[str], str] = lambda x: x


class ScrapeError(Exception):
    pass


class ScraperResult:
    name: str
    price: float
    image: str

    def __init__(self, name: str, price: float, image: str):
        self.name = name
        self.price = price
        self.image = image

    @override
    def __repr__(self) -> str:
        return f"<ScraperResult name:{self.name} price:{self.price} image:{self.image}>"


class ScraperLike(ABC):
    name: str
    urlRegex: str

    @abstractmethod
    def scrape(self, url: str) -> ScraperResult:
        pass


class GenericScraper(ScraperLike):
    """Scraper driven by CSS selectors for the product's name, price and image."""

    _nameQuery: str
    _priceQuery: str
    _imageQuery: str
    priceParser: Callable[[str], str]
    imageParser: Callable[[str], str]

    def __init__(
        self,
        name: str,
        baseUrl: str,
        nameQuery: str,
        priceQuery: str,
        imageQuery: str,
        priceParser: Callable[[str], str] = noReturnLambda,
        imageParser: Callable[[str], str] = noReturnLambda,
    ):
        self.name = name
        self.urlRegex = baseUrl
        self._nameQuery = nameQuery
        self._priceQuery = priceQuery
        self._imageQuery = imageQuery
        self.priceParser = priceParser
        self.imageParser = imageParser

    @override
    def scrape(self, url: str) -> ScraperResult:
        # Send a browser-like User-Agent; most shops block obvious bot clients.
        res = get(
            url,
            headers={
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:138.0) Gecko/20100101 Firefox/138.0"
            },
        )
        if res.status_code != 200:
            raise ScrapeError("Failed to fetch page.")

        soup = BeautifulSoup(res.text, features="html.parser")
        name = soup.select_one(self._nameQuery)
        price = soup.select_one(self._priceQuery)
        image = soup.select_one(self._imageQuery)

        if name is None or price is None or image is None:
            raise ScrapeError(
                f"Failed to scrape site. Invalid webpage or queries: N:{name},P:{price},I:{image}"
            )

        name = name.text.strip()
        image = image.get("src")
        try:
            # Normalise the price text, then extract "<integer>[.,<fraction>]".
            x = self.priceParser(price.text)
            reg = search(r"([0-9]+)(?:[.,]([0-9]+))?", x)
            if not reg:
                raise ValueError

            x = reg.group(1)
            g2 = reg.group(2)
            if g2:
                x += "." + g2

            price = float(x)
        except ValueError:
            print(price)
            raise ScrapeError("Failed to scrape site. Error while parsing price.")
        if not isinstance(image, str):
            raise ScrapeError("Failed to scrape site. Error while parsing image.")

        return ScraperResult(name, price, self.imageParser(image))


def scrapeSite(url: str) -> ScraperResult | None:
    """Scrape url with the first scraper whose urlRegex matches, or return None."""
    for i in scrapers:
        if match(i.urlRegex, url) is None:
            continue

        # Return on the first match so other sites aren't fetched needlessly.
        return i.scrape(url)

    return None


# Registry of supported shops; scrapeSite tries these in order.
scrapers = [
    GenericScraper(
        "Amazon",
        r"^https?:\/\/(www\.)?((amazon)|(amzn))\.\w*",
        "#productTitle",
        "#corePrice_feature_div > div:nth-child(1) > div:nth-child(1) > span:nth-child(1) > span:nth-child(1)",
        "#landingImage",
    ),
    GenericScraper(
        "Bol.com",
        r"^https?:\/\/(www\.)?bol\.com",
        ".page-heading > span:nth-child(1)",
        ".promo-price",
        "div.container-item:nth-child(2) > wsp-selected-item-image-zoom-modal-application:nth-child(1) > button:nth-child(2) > img:nth-child(1)",
        # Bol.com puts the fraction on its own line; turn the break into a decimal point.
        priceParser=lambda x: x.replace("\n ", "."),
    ),
    GenericScraper(
        "MediaMarkt",
        r"^https?:\/\/(www\.)?mediamarkt\.\w*",
        "h1.sc-d571b66f-0",
        ".sc-6db49389-0 > span:nth-child(2)",
        "div.sc-hLBbgP:nth-child(2) > div:nth-child(3) > ul:nth-child(1) > li:nth-child(1) > div:nth-child(1) > div:nth-child(1) > button:nth-child(1) > img:nth-child(1)",
        priceParser=lambda x: x.replace("€", ""),
    ),
    GenericScraper(
        "Coolblue",
        r"^https?:\/\/(www\.)?coolblue\.\w*",
        ".css-1o2kclk",
        ".css-puih25 > span:nth-child(1)",
        ".css-ptvba5",
    ),
    GenericScraper(
        "Megekko",
        r"^https?:\/\/(www\.)?megekko\.nl",
        "#prd_title",
        "a.prsPrice:nth-child(1) > div:nth-child(1)",
        "#prd_afbeeldingen > div:nth-child(1) > img:nth-child(1)",
        # Megekko serves relative image paths; make them absolute.
        imageParser=lambda x: f"https://www.megekko.nl/{x}",
    ),
]
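For reference, a minimal sketch of how the rest of the app might call the new module; the product URL below is a placeholder for illustration, not something from this commit. scrapeSite returns a ScraperResult, None when no scraper's urlRegex matches, and raises ScrapeError when fetching or parsing fails.

    from app.scrapers import ScrapeError, scrapeSite

    # Placeholder URL for illustration only.
    url = "https://www.bol.com/nl/nl/p/some-product/"

    try:
        result = scrapeSite(url)
    except ScrapeError as e:
        print(f"Could not scrape {url}: {e}")
    else:
        if result is None:
            print("No scraper registered for this site.")
        else:
            print(result.name, result.price, result.image)

Supporting another shop then only requires appending a GenericScraper with that site's CSS selectors (and, if needed, price/image parsers) to the scrapers list.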