# NOTE(review): removed duplicated file-listing metadata ("74 lines / 2.1 KiB /
# Python") that was pasted into the top of the file — it is not valid Python.
import sys
|
|
import time
|
|
|
|
from bs4 import BeautifulSoup
|
|
import requests
|
|
|
|
from craft_storage import CraftStorage
|
|
import device_extractor
|
|
import simple_extractor
|
|
from loguru import logger
|
|
|
|
class CraftFacade:
    """Facade that downloads craft pages and feeds them to the extractors.

    Tracks visited URLs so the recursive crawl terminates on cyclic links.
    """

    def __init__(self):
        # URLs already processed; guards against re-visits and infinite recursion.
        self.visited: set[str] = set()
        logger.add('file_{time}.log')
        logger.add(sys.stdout, colorize=True, format="<green>{time}</green> <level>{message}</level>")

    # ---------- low level ----------

    def _get_page_content(self, url: str) -> str | None:
        """Fetch *url* and return its HTML text, or ``None`` on any request error.

        Sleeps 0.5 s before each request to stay polite to the remote server.
        """
        time.sleep(0.5)
        try:
            # FIX: the original call had no timeout, so one stalled server
            # could hang the entire crawl indefinitely.
            r = requests.get(url, timeout=30)
            r.raise_for_status()
            return r.text
        except requests.RequestException as e:
            logger.error(f'Ошибка загрузки {url}: {e}')
            return None

    def _get_soup(self, html: str) -> BeautifulSoup:
        """Parse *html* with the stdlib ``html.parser`` backend."""
        return BeautifulSoup(html, "html.parser")

    # ---------- public API ----------

    def extract(self, url: str, storage: CraftStorage, recursive: bool = True):
        """Extract crafts from *url* into *storage*, optionally following links.

        Runs both the simple and the device extractor over the page. Links
        reported by either extractor are crawled recursively when *recursive*
        is true. Empty and already-visited URLs are skipped; extractor
        failures are logged but do not abort the other extractor or the crawl.
        """
        if not url:
            logger.error('Битая ссылка')
            return
        if url in self.visited:
            logger.debug(f'Уже посещали: {url}')
            return

        html = self._get_page_content(url)
        if not html:
            return

        soup = self._get_soup(html)
        logger.info(f'Анализирую {url}')

        self.visited.add(url)

        all_links: set[str] = set()

        try:
            links = simple_extractor.extract_crafts(soup, storage)
            all_links.update(links)
        except Exception as e:
            logger.error(f'Ошибка в простом парсере: {e}')
        try:
            links = device_extractor.extract_crafts(soup, storage)
            # FIX: the original did ``all_links.update(all_links)`` — a self-update
            # no-op that silently discarded every link the device extractor found.
            all_links.update(links)
        except Exception as e:
            logger.error(f'Ошибка в машинном парсере: {e}')

        logger.info(f'Готово: {url}')

        if not recursive:
            return

        logger.info('Начинаю обход внешних ссылок...')
        for link in all_links:
            self.extract(link, storage, recursive=True)