ComicScrapy/Comics/loader.py
2024-07-22 00:52:50 +08:00

167 lines
7.4 KiB
Python

import json,logging
from scrapy.loader import ItemLoader
from Comics.settings import PROJECT_KEY
class ComicLoader(ItemLoader):
    """Item loader for comic metadata, chapters and images.

    Extends :class:`ItemLoader` with:

    * dotted-path lookup in JSON/dict data (:meth:`parseExec`);
    * "set" semantics for :meth:`add_value`: once a field has an output
      value, adding to it replaces the stored value instead of appending;
    * one-line setters/getters for the comic fields used by the project.
    """

    def parseExec(self, data, exec):
        """Walk a dotted key path through JSON data.

        :param data: a ``dict``, or a JSON document accepted by
            ``json.loads`` (expected to decode to a mapping, since every
            step is resolved with ``.get``).
        :param exec: dotted key path such as ``"a.b.c"``; it is converted
            with ``str()`` before being split on ``"."``.  The name shadows
            the builtin but is kept so keyword callers keep working.
        :return: the value found at the end of the path, ``None`` as soon as
            a key is missing, or ``data`` unchanged when either argument is
            ``None``.
        """
        if data is not None and exec is not None:
            dots = str(exec).split(".")
            if not isinstance(data, dict):
                data = json.loads(data)
            for dot in dots:
                if data is not None:
                    data = data.get(dot)
                logging.debug(f"data= {data} dot={dot}")
        return data

    def add_xpath(self, field_name, xpath, *processors, index=None, exec=None, re=None, is_null=None, **kw):
        """Extract values with an XPath and store them in ``field_name``.

        Works like :meth:`ItemLoader.add_xpath` with three extra keyword
        options applied, in this order, to the extracted values:

        :param exec: dotted key path forwarded to :meth:`parseExec`.
        :param index: position to pick from the extracted values.  When the
            position is past the end (or there are no values at all) the
            fallback below is stored instead.
        :param is_null: fallback stored when ``index`` is out of range;
            defaults to the empty string.
        :param re: regular expression forwarded to :meth:`add_value`.

        Examples::

            # HTML snippet: <p class="product-name">Color TV</p>
            loader.add_xpath('name', '//p[@class="product-name"]', index=0)
            # HTML snippet: <p id="price">the price is $1200</p>
            loader.add_xpath('price', '//p[@id="price"]', re='the price is (.*)')
        """
        values = self._get_xpathvalues(xpath, **kw)
        if exec is not None:
            # NOTE(review): _get_xpathvalues presumably returns a list, which
            # parseExec would hand to json.loads as-is -- confirm that the
            # ``exec`` option is actually usable on this path.
            values = self.parseExec(values, exec)
        if index is not None:
            # ``values`` can be None after parseExec; treat it as "no match"
            # instead of crashing in len().
            if values is not None and len(values) - 1 >= index:
                try:
                    values = values[index]
                except (IndexError, KeyError, TypeError) as e:
                    # e.g. a negative index on a short list; keep the
                    # unindexed values and report the problem.
                    logging.error(f"values [{values}] error index [{index}]")
                    logging.error(e)
            else:
                values = is_null if is_null is not None else ""
        self.add_value(field_name, values, *processors, re=re, **kw)

    def add_exec(self, field_name, value, str_exec=None, *processors, re=None, **kw):
        """Store ``value`` in ``field_name``, optionally resolving a dotted path first.

        :param value: raw value, or JSON/dict data when ``str_exec`` is given.
        :param str_exec: dotted key path resolved with :meth:`parseExec`;
            when ``None`` the value is stored unchanged.
        """
        if str_exec is not None:
            value = self.parseExec(value, str_exec)
        self.add_value(field_name, value, *processors, re=re, **kw)

    def get_exec(self, value, str_exec):
        """Return the result of :meth:`parseExec` for ``value`` and ``str_exec``."""
        return self.parseExec(value, str_exec)

    def add_value(self, field_name, value, *processors, re=None, **kw):
        """Add ``value`` to ``field_name``, replacing any existing output value.

        When the field already has an output value the stored data is
        replaced (see :meth:`auto_replace_value`) and ``None`` is returned.
        Otherwise the call is delegated to :meth:`ItemLoader.add_value`.

        NOTE(review): on the replace path ``processors`` and ``re`` are not
        applied to ``value`` -- confirm whether that is intended.
        """
        if self.auto_replace_value(field_name, value):
            return super().add_value(field_name, value, *processors, re=re, **kw)
        return None

    def auto_replace_value(self, field_name, value):
        """Replace the stored value of ``field_name`` if it already has one.

        :return: ``False`` when the existing value was replaced with
            ``value``; ``True`` when the field had no output value (``None``)
            and the caller still has to add it.
        """
        if self.get_output_value(field_name) is not None:
            self._replace_value(field_name, value)
            return False
        return True

    # Set a comic property
    def set_properties(self, name, value=None, xpath=None, index=None, sexec=None):
        """Populate field ``name`` from a literal value, an XPath and/or a dotted path.

        :param value: stored directly when ``sexec`` is ``None``; otherwise
            it is the data the ``sexec`` path is resolved against.
        :param xpath: XPath passed to :meth:`add_xpath` (with ``index``).
        :param index: position to pick from the XPath result.
        :param sexec: dotted key path passed to :meth:`add_exec`.
        """
        if value is not None and sexec is None:
            self.add_value(field_name=name, value=value)
        if xpath is not None:
            self.add_xpath(field_name=name, xpath=xpath, index=index)
        if sexec is not None:
            self.add_exec(field_name=name, value=value, str_exec=sexec)

    # Project name
    def project_name(self, project_name):
        """Store the project name under ``PROJECT_KEY``."""
        self.add_value(PROJECT_KEY, project_name)

    # Comic name
    def name(self, value=None, xpath=None, index=None, sexec=None):
        """Set the ``name`` field (see :meth:`set_properties`)."""
        self.set_properties('name', value, xpath, index, sexec)

    # Comic cover link
    def icon(self, value=None, xpath=None, index=None, sexec=None):
        """Set the ``icon`` field (see :meth:`set_properties`)."""
        self.set_properties('icon', value, xpath, index, sexec)

    # Author
    def author(self, value=None, xpath=None, index=None, sexec=None):
        """Set the ``author`` field (see :meth:`set_properties`)."""
        self.set_properties('author', value, xpath, index, sexec)

    # Tags
    def tags(self, value=None, xpath=None, index=None, sexec=None):
        """Set the ``tags`` field (see :meth:`set_properties`)."""
        self.set_properties('tags', value, xpath, index, sexec)

    # Summary
    def dep(self, value=None, xpath=None, index=None, sexec=None):
        """Set the ``dep`` field (see :meth:`set_properties`)."""
        self.set_properties('dep', value, xpath, index, sexec)

    # Date
    def date(self, value=None, xpath=None, index=None, sexec=None):
        """Set the ``date`` field (see :meth:`set_properties`)."""
        self.set_properties('date', value, xpath, index, sexec)

    # Genre
    def genre(self, value=None, xpath=None, index=None, sexec=None):
        """Set the ``genre`` field (see :meth:`set_properties`)."""
        self.set_properties('genre', value, xpath, index, sexec)

    # Age rating
    def age_rating(self, value=None, xpath=None, index=None, sexec=None):
        """Set the ``age_rating`` field (see :meth:`set_properties`)."""
        self.set_properties('age_rating', value, xpath, index, sexec)

    # All chapters
    def chapters(self, value=None, xpath=None, index=None, sexec=None):
        """Set the ``chapters`` field (see :meth:`set_properties`)."""
        self.set_properties('chapters', value, xpath, index, sexec)

    # Single chapter
    def chapter(self, value=None, xpath=None, index=None, sexec=None):
        """Set the ``chapter`` field (see :meth:`set_properties`)."""
        self.set_properties('chapter', value, xpath, index, sexec)

    # Image names
    def images(self, value=None, xpath=None, index=None, sexec=None):
        """Set the ``images`` field (see :meth:`set_properties`)."""
        self.set_properties('images', value, xpath, index, sexec)

    # Image links
    def image_urls(self, value=None, xpath=None, index=None, sexec=None):
        """Set the ``image_urls`` field (see :meth:`set_properties`)."""
        self.set_properties('image_urls', value, xpath, index, sexec)

    def get_output_value(self, field_name, skip_field=("chapter",)):
        """Return the output value of ``field_name``, unwrapping one-element lists.

        A list holding exactly one element is reduced to that element.  For
        fields named in ``skip_field`` the one-element list is instead
        passed through ``"".join``; if that fails the list is returned
        as-is and a warning is logged.  Every other value is returned
        unchanged.

        :param skip_field: field names that are joined rather than unwrapped.
        """
        value = super().get_output_value(field_name)
        try:
            if isinstance(value, list) and len(value) == 1:
                if field_name not in skip_field:
                    value = value[0]
                else:
                    value = "".join(value)
        except TypeError:
            # "".join() rejects non-string elements; keep the raw value.
            logging.warning(f"get_output_value value={value} type={type(value)}")
        return value

    # Comic name
    def get_name(self):
        """Return the ``name`` output value."""
        return self.get_output_value("name")

    def get_sname(self):
        """Return the ``s_name`` output value."""
        return self.get_output_value("s_name")

    # Comic chapter
    def get_chapter(self):
        """Return the ``chapter`` output value."""
        return self.get_output_value("chapter")

    def get_schapter(self):
        """Return the ``s_chapter`` output value."""
        return self.get_output_value("s_chapter")

    # Comic cover
    def get_icon(self):
        """Return the ``icon`` output value."""
        return self.get_output_value("icon")

    # Project name
    def get_project_name(self):
        """Return the output value stored under ``PROJECT_KEY``."""
        return self.get_output_value(PROJECT_KEY)

    # Chapter link
    def get_chapter_href(self):
        """Return the ``chapter_href`` output value."""
        return self.get_output_value("chapter_href")

    # All chapters
    def get_chapters(self):
        """Return the ``chapters`` output value."""
        return self.get_output_value("chapters")

    def get_chapter_api(self):
        """Return the ``chapter_api`` output value."""
        return self.get_output_value("chapter_api")

    def get_images(self):
        """Return the ``images`` output value."""
        return self.get_output_value("images")

    def get_image_urls(self):
        """Return the ``image_urls`` output value."""
        return self.get_output_value("image_urls")

    ### ComicEntity
    def set_count(self, value):
        """Set the ``count`` field."""
        self.set_properties('count', value=value)

    def set_index(self, value):
        """Set the ``index`` field."""
        self.set_properties('index', value=value)

    def set_sname(self, value):
        """Set the ``s_name`` field."""
        self.set_properties('s_name', value=value)

    def set_chapter(self, value):
        """Set the ``chapter`` field."""
        self.set_properties('chapter', value=value)

    def set_schapter(self, value):
        """Set the ``s_chapter`` field."""
        self.set_properties('s_chapter', value=value)

    # Chapter page count
    def count(self):
        """Store the number of collected images in the ``count`` field.

        Nothing is stored when there are no images.
        """
        images = self.get_images()
        if images is None:
            return None
        # get_output_value() unwraps a one-element list, so a lone image
        # arrives here as a plain string; count it as one page rather than
        # taking the length of the string.
        total = 1 if isinstance(images, (str, bytes)) else len(images)
        if total != 0:
            return self.set_count(total)
        return None

    # Chapter number
    def index(self):
        """Store the 1-based position of the current chapter in ``index``.

        Nothing is stored when the chapter or the chapter list is missing,
        or when the chapter does not appear in the list (a warning is
        logged in that case).
        """
        chapters = self.get_chapters()
        chapter = self.get_chapter()
        if chapter is None or chapters is None or len(chapters) == 0:
            return None
        # A one-element chapter list is unwrapped by get_output_value();
        # re-wrap it so the lookup compares whole names, not substrings.
        if isinstance(chapters, (str, bytes)):
            chapters = [chapters]
        try:
            position = chapters.index(chapter)
        except ValueError:
            logging.warning(f"chapter [{chapter}] not found in chapters, index not set")
            return None
        return self.set_index(position + 1)

    def save_sname_schapter(self):
        """Copy the current ``chapter`` and ``name`` into ``s_chapter`` and ``s_name``."""
        chapter = self.get_chapter()
        name = self.get_name()
        if chapter is not None:
            self.set_schapter(chapter)
        if name is not None:
            self.set_sname(name)

    def load_item(self, chapter=None):
        """Fill the derived fields and build the item.

        ``count`` and ``index`` are computed first; then, if ``chapter`` is
        given, it overwrites the ``chapter`` field; finally ``s_name`` and
        ``s_chapter`` are refreshed before delegating to
        :meth:`ItemLoader.load_item`.

        :param chapter: optional replacement for the ``chapter`` field.
        :return: the populated item.
        """
        self.count()
        self.index()
        if chapter is not None:
            self.set_chapter(chapter)
        self.save_sname_schapter()
        return super().load_item()