158 lines
6.7 KiB
Python
158 lines
6.7 KiB
Python
import json,logging
|
|
from scrapy.loader import ItemLoader
|
|
from Comics.settings import PROJECT_KEY
|
|
|
|
class ComicLoader(ItemLoader):
    """Item loader for comic items.

    Extends :class:`ItemLoader` with three behaviours visible in this class:

    * dot-path extraction from JSON documents (:meth:`parseExec`);
    * optional index selection and fallback value in :meth:`add_xpath`;
    * "replace instead of append" semantics in :meth:`add_value` when the
      field already has an output value.
    """

    def parseExec(self, data, exec):
        """Walk a dot-separated key path through a mapping.

        :param data: a ``dict``, or anything ``json.loads`` accepts; non-dict
            input is decoded with ``json.loads`` before the walk
        :param exec: dot-separated key path, e.g. ``"props.pageProps.id"``
            (converted with ``str`` before splitting)
        :returns: the value found at the path, or ``None`` as soon as a key
            is missing; ``data`` is returned untouched when either argument
            is ``None``
        :raises AttributeError: if an intermediate value has no ``get``
            method (for example a list or a string)
        """
        if data is None or exec is None:
            return data
        if not isinstance(data, dict):
            data = json.loads(data)
        for dot in str(exec).split("."):
            # Once a key is missing ``data`` stays None for the remaining
            # segments; they are still logged, as before.
            if data is not None:
                data = data.get(dot)
            logging.debug("data= %s dot=%s", data, dot)
        return data

    def add_xpath(self, field_name, xpath, *processors, index=None, exec=None, re=None, is_null=None, **kw):
        """
        Similar to :meth:`ItemLoader.add_value` but receives an XPath instead of a
        value, which is used to extract a list of strings from the
        selector associated with this :class:`ItemLoader`.

        See :meth:`get_xpath` for ``kwargs``.

        :param xpath: the XPath to extract data from
        :type xpath: str
        :param index: when given, keep only ``values[index]``; if ``index`` is
            past the end of the extracted values (or there are no values at
            all) the fallback described under ``is_null`` is used instead
        :param exec: when given, the extracted values are passed through
            :meth:`parseExec` with this dot-separated path before indexing
        :param re: forwarded unchanged to :meth:`add_value`
        :param is_null: fallback used when ``index`` is out of range;
            an empty string is used when this is ``None``

        Examples::

            # HTML snippet: <p class="product-name">Color TV</p>
            loader.add_xpath('name', '//p[@class="product-name"]')
            # HTML snippet: <p id="price">the price is $1200</p>
            loader.add_xpath('price', '//p[@id="price"]', re='the price is (.*)')

        """
        values = self._get_xpathvalues(xpath, **kw)
        if exec is not None:
            # NOTE(review): parseExec hands anything that is not a dict to
            # json.loads. If _get_xpathvalues returns a list (as the stock
            # ItemLoader appears to), this call raises TypeError -- confirm
            # what input shape ``exec`` is meant to support here.
            values = self.parseExec(values, exec)
        if index is not None:
            # ``values`` is None when the exec path did not resolve; treat
            # that like "index out of range" instead of failing on len(None).
            if values is not None and len(values) - 1 >= index:
                try:
                    values = values[index]
                except Exception as e:
                    # Best effort: keep the un-indexed values and report.
                    logging.error("values [%s] error index [%s]", values, index)
                    logging.error(e)
            else:
                values = "" if is_null is None else is_null
        self.add_value(field_name, values, *processors, re=re, **kw)

    def add_exec(self, field_name, value, str_exec=None, *processors, re=None, **kw):
        """Add ``value`` to ``field_name`` after resolving an optional dot path.

        :param value: raw value, or a dict / JSON document when ``str_exec``
            is given
        :param str_exec: dot-separated path resolved with :meth:`parseExec`;
            skipped when ``None``
        """
        if str_exec is not None:
            value = self.parseExec(value, str_exec)
        self.add_value(field_name, value, *processors, re=re, **kw)

    def get_exec(self, value, str_exec):
        """Return the result of :meth:`parseExec` without touching any field."""
        return self.parseExec(value, str_exec)

    def add_value(self, field_name, value, *processors, re=None, **kw):
        """Add ``value`` to ``field_name``, replacing any existing output value.

        When the field already has an output value, :meth:`auto_replace_value`
        swaps it for ``value`` and nothing is appended; in that case ``None``
        is returned. Otherwise the call is delegated to the parent class and
        its result is returned.

        NOTE(review): on the replace path ``processors``, ``re`` and ``kw``
        are not applied to ``value`` -- confirm this is intended.
        """
        if self.auto_replace_value(field_name, value):
            return super().add_value(field_name, value, *processors, re=re, **kw)
        return None

    def auto_replace_value(self, field_name, value):
        """Replace the field's value if it already has one.

        :returns: ``False`` when an existing output value was replaced with
            ``value``; ``True`` when the field had no output value and the
            caller should add ``value`` itself
        """
        if self.get_output_value(field_name) is not None:
            self._replace_value(field_name, value)
            return False
        return True

    def set_properties(self, name, value=None, xpath=None, index=None, sexec=None):
        """Set a comic attribute from a literal value, an XPath, or a dot path.

        The three sources are checked independently, in this order:

        * ``value`` without ``sexec`` -> :meth:`add_value`
        * ``xpath`` -> :meth:`add_xpath` (with ``index``)
        * ``sexec`` -> :meth:`add_exec` applied to ``value``
        """
        if value is not None and sexec is None:
            self.add_value(field_name=name, value=value)
        if xpath is not None:
            self.add_xpath(field_name=name, xpath=xpath, index=index)
        if sexec is not None:
            self.add_exec(field_name=name, value=value, str_exec=sexec)

    def project_name(self, project_name):
        """Set the project name (stored under ``PROJECT_KEY``)."""
        self.add_value(PROJECT_KEY, project_name)

    def name(self, value=None, xpath=None, index=None, sexec=None):
        """Set the comic name."""
        self.set_properties('name', value, xpath, index, sexec)

    def icon(self, value=None, xpath=None, index=None, sexec=None):
        """Set the comic cover link."""
        self.set_properties('icon', value, xpath, index, sexec)

    def author(self, value=None, xpath=None, index=None, sexec=None):
        """Set the author."""
        self.set_properties('author', value, xpath, index, sexec)

    def tags(self, value=None, xpath=None, index=None, sexec=None):
        """Set the tags."""
        self.set_properties('tags', value, xpath, index, sexec)

    def dep(self, value=None, xpath=None, index=None, sexec=None):
        """Set the summary / overview."""
        self.set_properties('dep', value, xpath, index, sexec)

    def date(self, value=None, xpath=None, index=None, sexec=None):
        """Set the date."""
        self.set_properties('date', value, xpath, index, sexec)

    def genre(self, value=None, xpath=None, index=None, sexec=None):
        """Set the genre."""
        self.set_properties('genre', value, xpath, index, sexec)

    def age_rating(self, value=None, xpath=None, index=None, sexec=None):
        """Set the age rating."""
        self.set_properties('age_rating', value, xpath, index, sexec)

    def chapters(self, value=None, xpath=None, index=None, sexec=None):
        """Set the list of all chapters."""
        self.set_properties('chapters', value, xpath, index, sexec)

    def chapter(self, value=None, xpath=None, index=None, sexec=None):
        """Set the single (current) chapter."""
        self.set_properties('chapter', value, xpath, index, sexec)

    def images(self, value=None, xpath=None, index=None, sexec=None):
        """Set the image names."""
        self.set_properties('images', value, xpath, index, sexec)

    def image_urls(self, value=None, xpath=None, index=None, sexec=None):
        """Set the image links."""
        self.set_properties('image_urls', value, xpath, index, sexec)

    def get_output_value(self, field_name):
        """Return the parent's output value, unwrapping single-element lists.

        A list with exactly one element is replaced by that element; every
        other value is returned as the parent class produced it.
        """
        value = super().get_output_value(field_name)
        if isinstance(value, list) and len(value) == 1:
            value = value[0]
        return value

    def get_name(self):
        """Return the comic name."""
        return self.get_output_value("name")

    def get_chapter(self):
        """Return the current comic chapter."""
        return self.get_output_value("chapter")

    def get_project_name(self):
        """Return the project name (stored under ``PROJECT_KEY``)."""
        return self.get_output_value(PROJECT_KEY)

    def get_chapter_href(self):
        """Return the chapter link."""
        return self.get_output_value("chapter_href")

    def get_chapters(self):
        """Return all chapters."""
        return self.get_output_value("chapters")

    def get_chapter_api(self):
        """Return the output value of the ``chapter_api`` field."""
        return self.get_output_value("chapter_api")

    def get_image_urls(self):
        """Return the image links."""
        return self.get_output_value("image_urls")
|
|
|
|
class ComicEntity:
    """Wrapper around a subscriptable comic item that fills in derived fields.

    The wrapped object is read and written with ``obj[key]``; :meth:`item`
    stores the ``count`` and ``index`` fields on it and hands it back.
    """

    # Wrapped item; the class-level None is a placeholder until __init__
    # assigns the instance attribute.
    ENTITY = None

    def __init__(self, entity):
        """Wrap ``entity`` (any object supporting ``entity[key]``)."""
        self.ENTITY = entity

    def get_dict(self, key):
        """Return ``ENTITY[key]``, or an empty list when it cannot be read.

        A missing key/index, or an entity that is not subscriptable with
        ``key`` (for example ``None``), yields ``[]``.
        """
        try:
            return self.ENTITY[key]
        except (LookupError, TypeError):
            return []

    def set(self, key, value):
        """Store ``value`` under ``key`` on the wrapped entity."""
        self.ENTITY[key] = value

    def set_count(self, value):
        """Store the ``count`` field."""
        self.set('count', value)

    def set_index(self, value):
        """Store the ``index`` field."""
        self.set('index', value)

    def images(self):
        """Return the ``images`` field, or ``[]`` if absent."""
        return self.get_dict("images")

    def image_urls(self):
        """Return the ``image_urls`` field, or ``[]`` if absent."""
        return self.get_dict("image_urls")

    def chapters(self):
        """Return the ``chapters`` field, or ``[]`` if absent."""
        return self.get_dict("chapters")

    def chapter(self):
        """Return the ``chapter`` field, or ``[]`` if absent."""
        return self.get_dict("chapter")

    def icon(self):
        """Return the ``icon`` field, or ``[]`` if absent."""
        return self.get_dict("icon")

    def count(self):
        """Store the length of ``images`` as ``count`` and return it."""
        total = len(self.images())
        self.set_count(total)
        return total

    def index(self):
        """Store the 1-based position of ``chapter`` in ``chapters`` and return it.

        :raises ValueError: if the current chapter is not found in the
            chapter list (this includes the case where both are absent)
        """
        position = self.chapters().index(self.chapter()) + 1
        self.set_index(position)
        return position

    def item(self):
        """Fill in ``count`` and ``index`` and return the wrapped entity."""
        self.count()
        self.index()
        return self.ENTITY
|