import sys
import time

import requests
from bs4 import BeautifulSoup
from loguru import logger

import device_extractor
import simple_extractor
from craft_storage import CraftStorage


class CraftFacade:
    """Facade over the crawl pipeline: download a page, hand its soup to the
    simple/device craft extractors, and (optionally) recurse into every link
    the extractors return.
    """

    def __init__(self):
        # URLs already processed this session; guards against duplicate work
        # and against infinite recursion in extract().
        self.visited: set[str] = set()
        logger.add('file_{time}.log')
        logger.add(sys.stdout, colorize=True, format="{time} {message}")

    # ---------- low level ----------

    def _get_page_content(self, url: str) -> str | None:
        """Fetch *url* and return the response body, or None on any HTTP/
        network error (the error is logged, never raised to the caller)."""
        time.sleep(0.5)  # polite per-request delay so we don't hammer the host
        try:
            # timeout added: without one, a single unresponsive server would
            # hang the whole crawl indefinitely.
            r = requests.get(url, timeout=30)
            r.raise_for_status()
            return r.text
        except requests.RequestException as e:
            logger.error(f'Ошибка загрузки {url}: {e}')
            return None

    def _get_soup(self, html: str) -> BeautifulSoup:
        """Parse *html* with the stdlib-backed "html.parser" builder."""
        return BeautifulSoup(html, "html.parser")

    # ---------- public API ----------

    def extract(self, url: str, storage: CraftStorage, recursive: bool = True):
        """Extract crafts from *url* into *storage*.

        Empty, already-visited, or unreachable URLs are skipped (logged).
        When *recursive* is True, every link returned by the extractors is
        crawled in turn; the ``visited`` set terminates the recursion.
        """
        if not url:
            logger.error('Битая ссылка')
            return
        if url in self.visited:
            logger.debug(f'Уже посещали: {url}')
            return
        html = self._get_page_content(url)
        if not html:
            return
        soup = self._get_soup(html)
        logger.info(f'Анализирую {url}')
        self.visited.add(url)
        all_links: set[str] = set()
        try:
            links = simple_extractor.extract_crafts(soup, storage)
            all_links.update(links)
            links = device_extractor.extract_crafts(soup, storage)
            # BUG FIX: was ``all_links.update(all_links)`` — a self-update
            # no-op that silently discarded every link device_extractor found.
            all_links.update(links)
        except Exception as e:
            # Boundary handler: one page's broken markup must not stop the
            # crawl; log and continue with whatever links were collected.
            logger.error(f'Ошибка в парсере: {e}')
        logger.info(f'Готово: {url}')
        if not recursive:
            return
        logger.info('Начинаю обход внешних ссылок...')
        for link in all_links:
            self.extract(link, storage, recursive=True)