ComicScrapy/Comics/loader.py
2024-02-20 21:08:13 +08:00

158 lines
6.7 KiB
Python

import json,logging
from scrapy.loader import ItemLoader
from Comics.settings import PROJECT_KEY
class ComicLoader(ItemLoader):
    """Item loader for comic items.

    Adds the following on top of ``ItemLoader``:

    * dotted-path lookups into JSON data (``parseExec`` / ``add_exec``);
    * ``index`` / ``exec`` / ``is_null`` options on ``add_xpath``;
    * an ``add_value`` that replaces an already populated field instead of
      appending to it;
    * a ``get_output_value`` that unwraps single-element lists;
    * one convenience setter/getter per comic field.
    """

    def parseExec(self, data, exec):
        """Follow a dotted key path inside JSON data.

        :param data: a ``dict``, or JSON text accepted by ``json.loads``
        :param exec: dotted key path, e.g. ``"props.pageProps.bookName"``
        :return: the value found at the path; ``None`` when a key is missing
            or the path runs through a value that has no ``get``; ``data``
            unchanged when ``data`` or ``exec`` is ``None``
        :raises TypeError: when ``data`` is neither a dict nor something
            ``json.loads`` accepts
        """
        if data is None or exec is None:
            return data
        node = data if isinstance(data, dict) else json.loads(data)
        for dot in str(exec).split("."):
            if node is not None:
                # A value without ``get`` (list, str, number...) cannot hold
                # the key, so treat it like a missing key instead of raising
                # AttributeError.
                getter = getattr(node, "get", None)
                node = getter(dot) if callable(getter) else None
            logging.debug("data= %s dot=%s", node, dot)
        return node

    def add_xpath(self, field_name, xpath, *processors, index=None, exec=None, re=None, is_null=None, **kw):
        """
        Similar to :meth:`ItemLoader.add_value` but receives an XPath instead of a
        value, which is used to extract a list of strings from the
        selector associated with this :class:`ItemLoader`.

        See :meth:`get_xpath` for ``kwargs``.

        :param xpath: the XPath to extract data from
        :type xpath: str
        :param index: when given, keep only the element at this position of
            the extracted (or ``exec``-resolved) values
        :param exec: when given, dotted key path resolved with
            :meth:`parseExec` against the extracted JSON text
        :param is_null: value used when ``index`` is given but out of range
            or there is nothing to index; defaults to ``""``

        Examples::

            # HTML snippet: <p class="product-name">Color TV</p>
            loader.add_xpath('name', '//p[@class="product-name"]')
            # HTML snippet: <p id="price">the price is $1200</p>
            loader.add_xpath('price', '//p[@id="price"]', re='the price is (.*)')
        """
        values = self._get_xpathvalues(xpath, **kw)
        if exec is not None:
            # The extracted values arrive as a list of strings, which
            # json.loads rejects. Hand a single match over as plain text and
            # treat "no match" as missing data. Several matches are passed on
            # unchanged and still raise TypeError in parseExec.
            if isinstance(values, list):
                if len(values) == 1:
                    values = values[0]
                elif not values:
                    values = None
            values = self.parseExec(values, exec)
        if index is not None:
            # ``values`` is None when the exec lookup found nothing; fall
            # through to the is_null default instead of failing on len(None).
            if values is not None and len(values) - 1 >= index:
                try:
                    values = values[index]
                except Exception as e:
                    # Best effort: keep the un-indexed values and log why.
                    logging.error(f"values [{values}] error index [{index}]")
                    logging.error(e)
            elif is_null is not None:
                values = is_null
            else:
                values = ""
        self.add_value(field_name, values, *processors, re=re, **kw)

    def add_exec(self, field_name, value, str_exec=None, *processors, re=None, **kw):
        """Add ``value`` to ``field_name``, optionally resolving a JSON path first.

        :param value: raw value, or JSON data when ``str_exec`` is given
        :param str_exec: dotted key path resolved with :meth:`parseExec`
        """
        if str_exec is not None:
            value = self.parseExec(value, str_exec)
        self.add_value(field_name, value, *processors, re=re, **kw)

    def get_exec(self, value, str_exec):
        """Return the value at dotted path ``str_exec`` inside ``value``."""
        return self.parseExec(value, str_exec)

    def add_value(self, field_name, value, *processors, re=None, **kw):
        """Add ``value`` to ``field_name``, replacing any existing output value.

        When the field has no output value yet, this defers to
        ``ItemLoader.add_value``. Otherwise the collected values are replaced.
        """
        if self.get_output_value(field_name) is None:
            return super().add_value(field_name, value, *processors, re=re, **kw)
        # Apply ``re`` and ``processors`` before replacing, so the replace
        # path honours them just like the add path does.
        self._replace_value(field_name, self.get_value(value, *processors, re=re, **kw))
        return None

    def auto_replace_value(self, field_name, value):
        """Replace the field's values with ``value`` if it already has output.

        :return: ``False`` when a replacement happened, ``True`` when the
            field was still empty and the caller should add the value
        """
        if self.get_output_value(field_name) is not None:
            self._replace_value(field_name, value)
            return False
        return True

    # Set a comic property
    def set_properties(self, name, value=None, xpath=None, index=None, sexec=None):
        """Populate field ``name`` from a literal value, an XPath, or a JSON path.

        :param value: literal value; JSON data when ``sexec`` is given
        :param xpath: XPath to extract the value from
        :param index: position to keep from the XPath result
        :param sexec: dotted key path resolved against ``value``
        """
        if value is not None and sexec is None:
            self.add_value(field_name=name, value=value)
        if xpath is not None:
            self.add_xpath(field_name=name, xpath=xpath, index=index)
        if sexec is not None:
            self.add_exec(field_name=name, value=value, str_exec=sexec)

    # Project name
    def project_name(self, project_name):
        self.add_value(PROJECT_KEY, project_name)

    # Comic name
    def name(self, value=None, xpath=None, index=None, sexec=None):
        self.set_properties('name', value, xpath, index, sexec)

    # Comic cover link
    def icon(self, value=None, xpath=None, index=None, sexec=None):
        self.set_properties('icon', value, xpath, index, sexec)

    # Author
    def author(self, value=None, xpath=None, index=None, sexec=None):
        self.set_properties('author', value, xpath, index, sexec)

    # Tags
    def tags(self, value=None, xpath=None, index=None, sexec=None):
        self.set_properties('tags', value, xpath, index, sexec)

    # Summary
    def dep(self, value=None, xpath=None, index=None, sexec=None):
        self.set_properties('dep', value, xpath, index, sexec)

    # Date
    def date(self, value=None, xpath=None, index=None, sexec=None):
        self.set_properties('date', value, xpath, index, sexec)

    # Genre
    def genre(self, value=None, xpath=None, index=None, sexec=None):
        self.set_properties('genre', value, xpath, index, sexec)

    # Age rating
    def age_rating(self, value=None, xpath=None, index=None, sexec=None):
        self.set_properties('age_rating', value, xpath, index, sexec)

    # All chapters
    def chapters(self, value=None, xpath=None, index=None, sexec=None):
        self.set_properties('chapters', value, xpath, index, sexec)

    # Single chapter
    def chapter(self, value=None, xpath=None, index=None, sexec=None):
        self.set_properties('chapter', value, xpath, index, sexec)

    # Image names
    def images(self, value=None, xpath=None, index=None, sexec=None):
        self.set_properties('images', value, xpath, index, sexec)

    # Image links
    def image_urls(self, value=None, xpath=None, index=None, sexec=None):
        self.set_properties('image_urls', value, xpath, index, sexec)

    def get_output_value(self, field_name):
        """Return the field's output value, unwrapping a single-element list."""
        value = super().get_output_value(field_name)
        if isinstance(value, list) and len(value) == 1:
            return value[0]
        return value

    # Comic name
    def get_name(self):
        return self.get_output_value("name")

    # Comic chapter
    def get_chapter(self):
        return self.get_output_value("chapter")

    # Project name
    def get_project_name(self):
        return self.get_output_value(PROJECT_KEY)

    # Chapter link
    def get_chapter_href(self):
        return self.get_output_value("chapter_href")

    # All chapters
    def get_chapters(self):
        return self.get_output_value("chapters")

    def get_chapter_api(self):
        return self.get_output_value("chapter_api")

    def get_image_urls(self):
        return self.get_output_value("image_urls")
class ComicEntity:
    """Wrapper around a comic item mapping that derives ``count`` and ``index``.

    ``count`` is the number of images and ``index`` is the 1-based position
    of the current chapter within the chapter list; :meth:`item` stores both
    on the wrapped mapping and returns it.
    """

    # Wrapped mapping; replaced per instance in __init__.
    ENTITY = None

    def __init__(self, entity):
        self.ENTITY = entity

    @staticmethod
    def _as_sequence(value):
        """Return ``value`` as a sequence of entries.

        A bare string stands for one entry; without this, ``len`` and
        ``index`` would operate on its characters.
        """
        if isinstance(value, (str, bytes)):
            return [value]
        return value

    # Attribute getter
    def get_dict(self, key):
        """Return ``ENTITY[key]``, or ``[]`` when the lookup fails."""
        try:
            return self.ENTITY[key]
        except Exception:
            # Best-effort lookup: a missing key or unusable entity yields an
            # empty list, while KeyboardInterrupt/SystemExit still propagate.
            return []

    def set(self, key, value):
        """Store ``value`` under ``key`` on the wrapped mapping."""
        self.ENTITY[key] = value

    def set_count(self, value):
        self.set('count', value)

    def set_index(self, value):
        self.set('index', value)

    def images(self):
        return self.get_dict("images")

    def image_urls(self):
        return self.get_dict("image_urls")

    def chapters(self):
        return self.get_dict("chapters")

    def chapter(self):
        return self.get_dict("chapter")

    def icon(self):
        return self.get_dict("icon")

    def count(self):
        """Store the number of images under ``count``."""
        return self.set_count(len(self._as_sequence(self.images())))

    def index(self):
        """Store the 1-based position of the current chapter under ``index``.

        :raises ValueError: when the current chapter is not in the chapter list
        """
        position = self._as_sequence(self.chapters()).index(self.chapter())
        return self.set_index(position + 1)

    def item(self):
        """Compute ``count`` and ``index``, then return the wrapped mapping."""
        self.count()
        self.index()
        return self.ENTITY