Files
CraftCalc/craft_parser/simple_extractor.py
2026-01-19 23:58:25 +07:00

235 lines
9.7 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

from collections import Counter
import re
from bs4 import BeautifulSoup
from craft import CraftComponent, CraftItem, CraftRecipe
from craft_storage import CraftStorage
from dto import ParsedItem, ParsedRecipeInput, SpriteData
from loguru import logger
def extract_crafts(soup: BeautifulSoup, storage: CraftStorage) -> set[str]:
    """Store every visible crafting-table recipe found in ``soup``.

    Looks up all ``div.mcui-Crafting_Table-container`` elements. A container
    is skipped when it sits inside a collapsed table, when its output or its
    inputs cannot be parsed, or when ``storage`` reports that the recipe
    signature is already known.

    Args:
        soup: Parsed page to search for crafting-table containers.
        storage: Receives the items, recipes and components of new recipes.

    Returns:
        The source links collected from the inputs of every successfully
        parsed recipe (duplicates included); an empty set when the page has
        no crafting-table containers.
    """
    containers = soup.find_all("div", class_="mcui-Crafting_Table-container")
    if not containers:
        logger.info('Классических рецептов не найдено')
        return set()

    src_links: set[str] = set()
    for container in containers:
        if check_is_hidden(container):
            continue

        output_item = parse_craft_item(container)
        if output_item is None:
            continue

        input_kit = parse_craft_components_and_recipe(
            container,
            output_item.item.name,
            output_item.amount,
        )
        if input_kit is None:
            continue

        # Links are gathered before the duplicate check, so a recipe that is
        # already stored still contributes its links.
        src_links.update(input_kit.source_links)

        is_new = storage.try_add_recipe_signature(
            output_item.item.name,
            input_kit.components,
            'classic',
        )
        if not is_new:
            continue

        storage.add_item(output_item.item)
        for ingredient in input_kit.items:
            storage.add_item(ingredient)

        # Components are created with a placeholder id; bind them to the id
        # the storage assigned to the recipe before saving them.
        recipe_id = storage.add_recipe(input_kit.recipe)
        for component in input_kit.components:
            component.recipe_id = recipe_id
            storage.add_component(component)

        logger.info(f'Добавлен рецепт крафта для предмета {output_item.item.name}')

    return src_links
def check_is_hidden(container: BeautifulSoup) -> bool:
    """Tell whether ``container`` lives inside a collapsed table.

    Returns ``True`` (after logging a warning) when ``container`` has an
    ancestor ``<table>`` with the ``collapsed`` class, ``False`` otherwise.
    """
    collapsed_table = container.find_parent('table', class_='collapsed')
    if not collapsed_table:
        return False

    logger.warning(f'{container} Не будет исследован. Причина: помечен как скрытый')
    return True
def parse_craft_item(container) -> ParsedItem|None:
    """Parse the output slot of a crafting-table container.

    Finds ``span.mcui-output`` and, inside it, ``span.invslot-item``. The item
    data is read from the slot's sprite span first; when that yields nothing
    the plain ``<img>`` extractor is tried instead.

    Args:
        container: Element holding one crafting-table widget.

    Returns:
        A ``ParsedItem`` with the resulting ``CraftItem`` and the amount
        reported by the extractor, or ``None`` (after logging an error) when
        the output cell, its item container, or the item data is missing.
    """
    output_span = container.find('span', class_='mcui-output')
    if output_span is None:
        logger.error(f'ошибка для \n{container}\n: не найдено ячейки с результатом!')
        return None

    output_span_name_container = output_span.find('span', class_='invslot-item')
    if output_span_name_container is None:
        logger.error(f'ошибка для \n{output_span}\n: не найден текстовый контейнер!')
        return None

    # Prefer the sprite span; fall back to a plain <img> when it is unusable.
    sprite_data = extract_data_from_sprite_span(output_span_name_container)
    if sprite_data is None:
        sprite_data = extract_data_from_sprite_img(output_span_name_container)
    if sprite_data is None:
        logger.error(f'ошибка для \n{output_span_name_container}\n: не удалось извлечь картинку и заголовок ни одним из способов')
        return None

    item = CraftItem(sprite_data.title, sprite_data.img_url, sprite_data.shift)
    # Use the amount the extractor read from the slot. It was previously
    # hard-coded to 1, which dropped the stack size of multi-item outputs.
    return ParsedItem(item=item, amount=sprite_data.amount)
def parse_craft_components_and_recipe(container: BeautifulSoup, output_item_name: str, output_count: int = 1) -> ParsedRecipeInput|None:
    """Parse the input grid of a crafting-table container.

    Reads every ``span.invslot-item`` inside ``span.mcui-input``. Slots with
    no contents are skipped. Each remaining slot is read through the sprite
    span extractor, falling back to the plain ``<img>`` extractor.

    Args:
        container: Element holding one crafting-table widget.
        output_item_name: Name of the item the recipe produces.
        output_count: How many items the recipe produces.

    Returns:
        A ``ParsedRecipeInput`` with one ``CraftItem`` per filled slot, the
        ``CraftRecipe``, one ``CraftComponent`` per distinct item name (with
        the number of slots it occupies) and the source links found in the
        slots. ``None`` (after logging an error) when the input area or its
        slots are missing, or when a filled slot cannot be parsed.
    """
    input_span = container.find('span', class_='mcui-input')
    if input_span is None:
        logger.error(f'ошибка для \n{container}\n: не найдено рецепта крафта')
        return None

    slots = input_span.find_all('span', class_='invslot-item')
    if not slots:
        logger.error(f'Ошибка для \n{input_span}\n: не найдено айтемов в рецепте')
        return None

    craft_items: list[CraftItem] = []
    src_links: list[str] = []
    for slot in slots:
        if not slot.contents:
            logger.warning('Пустая ячейка, скипаю')
            continue

        # Prefer the sprite span; fall back to a plain <img>.
        sprite_data = extract_data_from_sprite_span(slot)
        if sprite_data is None:
            sprite_data = extract_data_from_sprite_img(slot)
        if sprite_data is None:
            logger.error(f'ошибка для \n{slot}\n: не удалось извлечь картинку и заголовок ни одним из способов')
            return None

        if sprite_data.source_link is not None:
            src_links.append(sprite_data.source_link)
        craft_items.append(CraftItem(sprite_data.title, sprite_data.img_url, sprite_data.shift))

    # One component per distinct item name; the count is the number of slots
    # that item occupies. No cap on the number of distinct names.
    name_counts = Counter(item.name for item in craft_items)
    recipe = CraftRecipe(output_item_name, output_count, 'Верстак')
    # WARN: -1 is a placeholder; the caller replaces it once the recipe has
    # been saved and its real id is known.
    craft_components = [
        CraftComponent(-1, name, count)
        for name, count in name_counts.most_common()
    ]

    return ParsedRecipeInput(
        items=craft_items,
        recipe=recipe,
        components=craft_components,
        source_links=src_links,
    )
def extract_data_from_sprite_img(input_span_name_container) -> SpriteData|None:
    """Build sprite data from a plain ``<img>`` inside an inventory slot.

    The image's ``alt`` text becomes the title and its ``src`` the image URL;
    the amount is fixed to 1, the shift to ``(0, 0)`` and no source link is
    set.

    Returns:
        ``SpriteData`` for the image, or ``None`` when the slot has no
        ``<img>`` or (after logging an error) the image lacks ``alt`` or
        ``src``.
    """
    img_tag = input_span_name_container.find('img')
    if img_tag is None:
        return None

    if (alt_text := img_tag.get('alt')) is None:
        logger.error(f'ошибка для \n{input_span_name_container}\n: не найдено описание картинки!')
        return None

    if (src := img_tag.get('src')) is None:
        logger.error(f'ошибка для \n{input_span_name_container}\n: не найден url картинки!')
        return None

    return SpriteData(amount=1, title=alt_text, shift=(0, 0), img_url=src, source_link=None)
def extract_data_from_sprite_span(input_span_name_container) -> SpriteData|None:
    """Build sprite data from the ``span.sprite`` inside an inventory slot.

    Reads the title from the sprite's ``title`` attribute, the amount from an
    optional ``span.invslot-stacksize`` (1 when absent), the source link from
    the slot's first ``<a>``, the image identifier from the sprite's
    ``*-sprite`` CSS class and the shift from the sprite's inline style.

    Returns:
        ``SpriteData`` for the slot, or ``None`` when the slot has no sprite
        span or (after logging an error) the title, sprite class or shift is
        missing.

    Raises:
        ValueError: If the stack-size text is not a valid integer.
    """
    input_item_sprite_span = input_span_name_container.find('span', class_='sprite')
    # The sprite container may legitimately be absent.
    if input_item_sprite_span is None:
        return None

    input_item_title = input_item_sprite_span.get('title')
    if input_item_title is None:
        logger.error(f'ошибка для \n{input_span_name_container}\n: не найдено заголовка предмета')
        return None

    output_amount = 1
    amount_container = input_span_name_container.find('span', class_='invslot-stacksize')
    if amount_container is not None:
        logger.info(f'Для объекта {input_item_title} найдено количество : {amount_container.text}')
        output_amount = int(amount_container.text)

    item_source_link = get_item_source_link(input_span_name_container)

    input_item_img_url = extract_img_classname(input_item_sprite_span)
    if input_item_img_url is None:
        # Log the sprite span itself: the looked-up value is always None on
        # this branch and would say nothing about the offending element.
        logger.error(f'Ошибка для \n{input_item_sprite_span}\n: не найден файл спрайта')
        return None

    input_item_img_shift = extract_img_shift(input_item_sprite_span)
    if input_item_img_shift is None:
        logger.error(f'Ошибка для \n{input_item_sprite_span}\n: не найдено смещения для спрайта')
        return None

    return SpriteData(
        amount=output_amount,
        title=input_item_title,
        shift=input_item_img_shift,
        img_url=input_item_img_url,
        source_link=item_source_link,
    )
def extract_img_shift(output_item_sprite_span):
if(output_item_sprite_span is None):
return None
css_value = output_item_sprite_span.get('style')
if(css_value is None):
return None
# Шаблон: ищем два числа (с возможным минусом) перед 'px'
pattern = r'(-?\d+)px\s+(-?\d+)px'
match = re.search(pattern, css_value)
if match:
x = int(match.group(1))
y = int(match.group(2))
return (x, y)
else:
return None # Если шаблон не найден
def extract_img_classname(sprite_span):
if(sprite_span is None):
return None
'sprite industrialcraft-2-inv-sprite'
classes = sprite_span.get('class', [])
for cls in classes:
if cls.endswith('-sprite'):
return cls
return None # Если не найдено
def get_item_source_link(input_span_name_container) -> str|None:
link_container = input_span_name_container.find('a')
if(link_container is None):
return None
link = link_container.get('href')
if(link is None):
return None
return f'https://ru.minecraft.wiki{link}'