Files
CraftCalc/craft_parser/craft_facade.py
2026-01-19 22:17:38 +07:00

71 lines
2.0 KiB
Python

import sys
import time
from bs4 import BeautifulSoup
import requests
from craft_storage import CraftStorage
import device_extractor
import simple_extractor
from loguru import logger
class CraftFacade:
    """Download craft pages and run the craft extractors over them.

    The facade fetches a page over HTTP, parses it with BeautifulSoup and
    passes the parsed document to ``simple_extractor`` and
    ``device_extractor``. The links those extractors return can be followed
    recursively. URLs that were already analysed are remembered in
    ``visited`` so that each page is processed at most once per instance.
    """

    # Seconds to sleep before every HTTP request.
    REQUEST_DELAY: float = 0.5
    # Seconds to wait for the server before giving up on a request; without
    # a timeout ``requests`` may block indefinitely on a stalled connection.
    REQUEST_TIMEOUT: float = 10.0

    # Log sinks are process-wide, so they are registered only once no matter
    # how many facades are created (otherwise every message is duplicated).
    _sinks_configured: bool = False

    def __init__(self):
        """Create an empty visited-set and register the log sinks once."""
        self.visited: set[str] = set()
        if not CraftFacade._sinks_configured:
            logger.add('file_{time}.log')
            logger.add(
                sys.stdout,
                colorize=True,
                format="<green>{time}</green> <level>{message}</level>",
            )
            CraftFacade._sinks_configured = True

    # ---------- low level ----------
    def _get_page_content(self, url: str) -> str | None:
        """Return the body of ``url`` as text, or ``None`` on a request error.

        Sleeps ``REQUEST_DELAY`` seconds before the request. Any
        ``requests.RequestException`` (including HTTP error statuses raised
        by ``raise_for_status`` and timeouts) is logged and turned into
        ``None``.
        """
        time.sleep(self.REQUEST_DELAY)
        try:
            r = requests.get(url, timeout=self.REQUEST_TIMEOUT)
            r.raise_for_status()
            return r.text
        except requests.RequestException as e:
            logger.error(f'Ошибка загрузки {url}: {e}')
            return None

    def _get_soup(self, html: str) -> BeautifulSoup:
        """Parse ``html`` with the built-in ``html.parser`` backend."""
        return BeautifulSoup(html, "html.parser")

    # ---------- public API ----------
    def extract(self, url: str, storage: CraftStorage, recursive: bool = True) -> None:
        """Analyse ``url`` and, optionally, every link the extractors return.

        Args:
            url: Page to analyse. An empty value is logged and ignored.
            storage: Passed unchanged to both extractors.
            recursive: When true, call ``extract`` again for every link
                returned by the extractors.

        The URL is added to ``visited`` only after it was downloaded
        successfully, so a page that failed to load may be retried later.
        """
        if not url:
            logger.error('Битая ссылка')
            return
        if url in self.visited:
            logger.debug(f'Уже посещали: {url}')
            return

        html = self._get_page_content(url)
        if not html:
            return

        soup = self._get_soup(html)
        logger.info(f'Анализирую {url}')
        self.visited.add(url)

        all_links: set[str] = set()
        # Each extractor runs in its own try-block so that a failure in one
        # does not prevent the other from processing the page, and the links
        # of *both* extractors are collected.
        for extractor in (simple_extractor, device_extractor):
            try:
                all_links.update(extractor.extract_crafts(soup, storage))
            except Exception as e:
                logger.error(f'Ошибка в парсере: {e}')

        logger.info(f'Готово: {url}')
        if not recursive:
            return

        logger.info('Начинаю обход внешних ссылок...')
        for link in all_links:
            self.extract(link, storage, recursive=True)