craft_parser_init
This commit is contained in:
71
craft_parser/craft_facade.py
Normal file
71
craft_parser/craft_facade.py
Normal file
@ -0,0 +1,71 @@
|
||||
import sys
|
||||
import time
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
import requests
|
||||
|
||||
from craft_storage import CraftStorage
|
||||
import device_extractor
|
||||
import simple_extractor
|
||||
from loguru import logger
|
||||
|
||||
class CraftFacade:
    """Fetch pages and run the craft extractors over them.

    A page is downloaded, parsed with BeautifulSoup and handed to
    ``simple_extractor`` and ``device_extractor``. The links those
    extractors return can then be followed recursively. URLs that were
    successfully downloaded are remembered in ``visited`` so each page is
    analysed at most once per facade instance.
    """

    # Pause (seconds) made before every HTTP request.
    REQUEST_DELAY: float = 0.5
    # Upper bound (seconds) for a single HTTP request; without it one
    # unresponsive server would block the whole crawl indefinitely.
    REQUEST_TIMEOUT: float = 30.0

    def __init__(self):
        # URLs whose content has already been downloaded and analysed.
        self.visited: set[str] = set()
        # NOTE: both sinks are registered on every instantiation, so creating
        # several facades adds several sinks to the shared loguru logger.
        logger.add('file_{time}.log')
        logger.add(sys.stdout, colorize=True, format="<green>{time}</green> <level>{message}</level>")

    # ---------- low level ----------

    def _get_page_content(self, url: str) -> str | None:
        """Download ``url`` and return the response body as text.

        Sleeps ``REQUEST_DELAY`` seconds before the request. Any
        ``requests.RequestException`` (connection failure, timeout, HTTP
        error status) is logged and reported as ``None``.
        """
        time.sleep(self.REQUEST_DELAY)
        try:
            r = requests.get(url, timeout=self.REQUEST_TIMEOUT)
            r.raise_for_status()
            return r.text
        except requests.RequestException as e:
            logger.error(f'Ошибка загрузки {url}: {e}')
            return None

    def _get_soup(self, html: str) -> BeautifulSoup:
        """Parse ``html`` with the built-in ``html.parser`` backend."""
        return BeautifulSoup(html, "html.parser")

    # ---------- public API ----------

    def extract(self, url: str, storage: CraftStorage, recursive: bool = True):
        """Analyse the page at ``url`` and optionally follow discovered links.

        Args:
            url: Address of the page to analyse. A falsy value is logged as
                an error and skipped.
            storage: Passed through to each extractor's ``extract_crafts``.
            recursive: When true, ``extract`` is called again for every link
                the extractors returned. The traversal uses plain recursion,
                so its depth is limited by the interpreter's recursion limit.

        Returns:
            None. Results are communicated through ``storage`` and the log.
        """
        if not url:
            logger.error('Битая ссылка')
            return
        if url in self.visited:
            logger.debug(f'Уже посещали: {url}')
            return

        html = self._get_page_content(url)
        if not html:
            return

        soup = self._get_soup(html)
        logger.info(f'Анализирую {url}')

        # Marked only after a successful download, so a URL that failed to
        # load may be attempted again if it is encountered later.
        self.visited.add(url)

        all_links: set[str] = set()

        try:
            # Collect links from every extractor. Looping keeps the merge in
            # one place: the previous copy-pasted version merged ``all_links``
            # into itself and lost the links found by ``device_extractor``.
            for extractor in (simple_extractor, device_extractor):
                links = extractor.extract_crafts(soup, storage)
                all_links.update(links)
        except Exception as e:
            # Best effort: a failing extractor must not abort the crawl.
            # Links gathered before the failure are still followed below.
            logger.error(f'Ошибка в парсере: {e}')

        logger.info(f'Готово: {url}')

        if not recursive:
            return

        logger.info('Начинаю обход внешних ссылок...')
        for link in all_links:
            self.extract(link, storage, recursive=True)
|
||||
Reference in New Issue
Block a user