ComicScrapy/Comics/loader.py

import json,logging
from scrapy.loader import ItemLoader
from Comics.settings import PROJECT_KEY

class ComicLoader(ItemLoader):
    """Item loader with JSON-path extraction and comic-specific field helpers.

    On top of the inherited ``ItemLoader`` behaviour this class:

    * can drill into JSON documents with a dot-separated key path
      (:meth:`parseExec`, :meth:`add_exec`, the ``exec`` option of
      :meth:`add_xpath`);
    * replaces an already-populated field instead of appending to it
      (:meth:`add_value` / :meth:`auto_replace_value`);
    * unwraps single-element list outputs (:meth:`get_output_value`);
    * offers one setter and one getter per comic field.
    """

    def parseExec(self, data, exec):
        """Walk a dot-separated key path through a JSON document.

        :param data: a ``dict``, or anything ``json.loads`` accepts (it is
            decoded first when it is not already a ``dict``).
        :param exec: the key path, e.g. ``"a.b.c"``; converted with ``str()``
            before splitting on ``"."``.  (The name shadows the builtin but is
            kept so keyword callers keep working.)
        :returns: ``data`` unchanged when either argument is ``None``;
            otherwise the value found at the path, or ``None`` when a key is
            missing or an intermediate value is not a ``dict``.
        :raises: whatever ``json.loads`` raises for undecodable input.
        """
        if data is None or exec is None:
            return data
        if not isinstance(data, dict):
            data = json.loads(data)
        for key in str(exec).split("."):
            # A non-dict intermediate (list, str, None, ...) cannot be walked
            # any further; treat it like a missing key instead of failing
            # with AttributeError on ``.get``.
            data = data.get(key) if isinstance(data, dict) else None
            logging.debug("data= %s dot=%s", data, key)
        return data

    def add_xpath(self, field_name, xpath, *processors, index=None, exec=None, re=None, is_null=None, **kw):
        """
        Similar to :meth:`ItemLoader.add_value` but receives an XPath instead of a
        value, which is used to extract a list of strings from the
        selector associated with this :class:`ItemLoader`.

        See :meth:`get_xpath` for ``kwargs``.

        :param xpath: the XPath to extract data from
        :type xpath: str
        :param exec: optional dot-separated JSON key path (see
            :meth:`parseExec`).  When the extracted values are a list or
            tuple the path is applied to every element; any other value is
            passed to :meth:`parseExec` as a whole.
        :param index: optional position to pick from the (possibly
            ``exec``-processed) values.  When that position does not exist,
            ``is_null`` is used instead.
        :param is_null: fallback value for a missing ``index``; an empty
            string is used when it is ``None``.

        Examples::

            # HTML snippet: <p class="product-name">Color TV</p>
            loader.add_xpath('name', '//p[@class="product-name"]')
            # HTML snippet: <p id="price">the price is $1200</p>
            loader.add_xpath('price', '//p[@id="price"]', re='the price is (.*)')

        """
        values = self._get_xpathvalues(xpath, **kw)
        if exec is not None:
            if isinstance(values, (list, tuple)):
                # json.loads cannot decode a list, so resolve the path on
                # each extracted string separately.
                values = [self.parseExec(item, exec) for item in values]
            else:
                values = self.parseExec(values, exec)
        if index is not None:
            try:
                values = values[index]
            except IndexError:
                # Covers out-of-range positive *and* negative indexes.
                values = is_null if is_null is not None else ""
        self.add_value(field_name, values, *processors, re=re, **kw)

    def add_exec(self, field_name, value, str_exec=None, *processors, re=None, **kw):
        """Add ``value`` to ``field_name``, optionally resolving a JSON path first.

        :param value: raw value, or a JSON document when ``str_exec`` is given.
        :param str_exec: optional dot-separated key path handed to
            :meth:`parseExec`; when ``None`` the value is added as-is.
        """
        if str_exec is not None:
            value = self.parseExec(value, str_exec)
        self.add_value(field_name, value, *processors, re=re, **kw)

    def get_exec(self, value, str_exec):
        """Return the result of :meth:`parseExec` without storing anything."""
        return self.parseExec(value, str_exec)

    def add_value(self, field_name, value, *processors, re=None, **kw):
        """Add ``value`` to ``field_name``, replacing any existing output.

        :meth:`auto_replace_value` decides which path is taken.  When it has
        already replaced the field's value, nothing more is done and ``None``
        is returned; note that ``processors``, ``re`` and ``kw`` are not
        forwarded in that case.  Otherwise the call is delegated to the
        parent ``add_value`` and its result is returned.
        """
        if self.auto_replace_value(field_name, value):
            return super().add_value(field_name, value, *processors, re=re, **kw)
        return None

    def auto_replace_value(self, field_name, value):
        """Replace the field's value when it already has an output value.

        :returns: ``False`` when the current output of ``field_name`` was not
            ``None`` and ``_replace_value`` was called with the new value;
            ``True`` when nothing was replaced and the caller should add the
            value itself.
        """
        if self.get_output_value(field_name) is not None:
            self._replace_value(field_name, value)
            return False
        return True

    def set_properties(self, name, value=None, xpath=None, index=None, sexec=None):
        """Set a comic property from a plain value, an XPath or a JSON path.

        The three sources are checked independently, so more than one may
        apply in a single call:

        * ``value`` given and ``sexec`` not given: ``value`` is added directly;
        * ``xpath`` given: the XPath result is added (``index`` is forwarded);
        * ``sexec`` given: ``value`` is treated as a JSON document and the
          path ``sexec`` is resolved through :meth:`add_exec`.
        """
        if value is not None and sexec is None:
            self.add_value(field_name=name, value=value)
        if xpath is not None:
            self.add_xpath(field_name=name, xpath=xpath, index=index)
        if sexec is not None:
            self.add_exec(field_name=name, value=value, str_exec=sexec)

    def project_name(self, project_name):
        """Store the project name under ``PROJECT_KEY``."""
        self.add_value(PROJECT_KEY, project_name)

    def name(self, value=None, xpath=None, index=None, sexec=None):
        """Set the comic name (field ``name``)."""
        self.set_properties('name', value, xpath, index, sexec)

    def icon(self, value=None, xpath=None, index=None, sexec=None):
        """Set the comic cover link (field ``icon``)."""
        self.set_properties('icon', value, xpath, index, sexec)

    def author(self, value=None, xpath=None, index=None, sexec=None):
        """Set the author (field ``author``)."""
        self.set_properties('author', value, xpath, index, sexec)

    def tags(self, value=None, xpath=None, index=None, sexec=None):
        """Set the tags (field ``tags``)."""
        self.set_properties('tags', value, xpath, index, sexec)

    def dep(self, value=None, xpath=None, index=None, sexec=None):
        """Set the summary / description (field ``dep``)."""
        self.set_properties('dep', value, xpath, index, sexec)

    def date(self, value=None, xpath=None, index=None, sexec=None):
        """Set the date (field ``date``)."""
        self.set_properties('date', value, xpath, index, sexec)

    def genre(self, value=None, xpath=None, index=None, sexec=None):
        """Set the genre (field ``genre``)."""
        self.set_properties('genre', value, xpath, index, sexec)

    def age_rating(self, value=None, xpath=None, index=None, sexec=None):
        """Set the age rating (field ``age_rating``)."""
        self.set_properties('age_rating', value, xpath, index, sexec)

    def chapters(self, value=None, xpath=None, index=None, sexec=None):
        """Set the list of all chapters (field ``chapters``)."""
        self.set_properties('chapters', value, xpath, index, sexec)

    def chapter(self, value=None, xpath=None, index=None, sexec=None):
        """Set the single current chapter (field ``chapter``)."""
        self.set_properties('chapter', value, xpath, index, sexec)

    def images(self, value=None, xpath=None, index=None, sexec=None):
        """Set the image names (field ``images``)."""
        self.set_properties('images', value, xpath, index, sexec)

    def image_urls(self, value=None, xpath=None, index=None, sexec=None):
        """Set the image links (field ``image_urls``)."""
        self.set_properties('image_urls', value, xpath, index, sexec)

    def get_output_value(self, field_name, skip_field=("chapter",)):
        """Return the field's output value, unwrapping single-element lists.

        :param field_name: field whose output value is requested.
        :param skip_field: field names whose one-element list is collapsed
            with ``"".join`` instead of being indexed.
        :returns: the parent's output value; when that is a list with exactly
            one element, the element itself (or the joined string for names
            in ``skip_field``).  If unwrapping fails (for example joining a
            non-string element) a warning is logged and the value is returned
            as it was.
        """
        value = super().get_output_value(field_name)
        try:
            if isinstance(value, list) and len(value) == 1:
                if field_name not in skip_field:
                    value = value[0]
                else:
                    value = "".join(value)
        except Exception:
            # Best effort: keep the un-unwrapped value rather than failing.
            logging.warning("get_output_value value=%s type=%s", value, type(value))
        return value

    def get_name(self):
        """Return the comic name (field ``name``)."""
        return self.get_output_value("name")

    def get_sname(self):
        """Return the value of field ``s_name``."""
        return self.get_output_value("s_name")

    def get_chapter(self):
        """Return the current chapter (field ``chapter``)."""
        return self.get_output_value("chapter")

    def get_schapter(self):
        """Return the value of field ``s_chapter``."""
        return self.get_output_value("s_chapter")

    def get_icon(self):
        """Return the comic cover (field ``icon``)."""
        return self.get_output_value("icon")

    def get_project_name(self):
        """Return the project name stored under ``PROJECT_KEY``."""
        return self.get_output_value(PROJECT_KEY)

    def get_chapter_href(self):
        """Return the chapter link (field ``chapter_href``)."""
        return self.get_output_value("chapter_href")

    def get_chapters(self):
        """Return all chapters (field ``chapters``)."""
        return self.get_output_value("chapters")

    def get_chapter_api(self):
        """Return the value of field ``chapter_api``."""
        return self.get_output_value("chapter_api")

    def get_images(self):
        """Return the image names (field ``images``)."""
        return self.get_output_value("images")

    def get_image_urls(self):
        """Return the image links (field ``image_urls``)."""
        return self.get_output_value("image_urls")

    # --- ComicEntity fields -------------------------------------------------

    def set_count(self, value):
        """Set field ``count``."""
        self.set_properties('count', value=value)

    def set_index(self, value):
        """Set field ``index``."""
        self.set_properties('index', value=value)

    def set_sname(self, value):
        """Set field ``s_name``."""
        self.set_properties('s_name', value=value)

    def set_chapter(self, value):
        """Set field ``chapter``."""
        self.set_properties('chapter', value=value)

    def set_schapter(self, value):
        """Set field ``s_chapter``."""
        self.set_properties('s_chapter', value=value)

    def count(self):
        """Store the number of images of the chapter in field ``count``.

        Nothing is stored when there are no images.
        """
        images = self.get_images()
        if images is None:
            return None
        if isinstance(images, (list, tuple)):
            total = len(images)
        else:
            # get_output_value unwraps a one-element list, so a non-list here
            # is a single image; len() would count its characters instead.
            total = 1
        if total != 0:
            return self.set_count(total)
        return None

    def index(self):
        """Store the 1-based position of the current chapter in field ``index``.

        Nothing is stored when the chapter or the chapter list is missing or
        empty; a warning is logged when the chapter is not in the list.
        """
        chapters = self.get_chapters()
        chapter = self.get_chapter()
        if chapter is None or chapters is None:
            return None
        if not isinstance(chapters, (list, tuple)):
            # A one-element chapter list comes back unwrapped.
            chapters = [chapters]
        if len(chapters) == 0:
            return None
        if chapter not in chapters:
            logging.warning("chapter [%s] not found in chapters", chapter)
            return None
        return self.set_index(chapters.index(chapter) + 1)

    def save_sname_schapter(self):
        """Copy the current ``chapter`` and ``name`` into ``s_chapter`` and ``s_name``."""
        chapter = self.get_chapter()
        name = self.get_name()
        if chapter is not None:
            self.set_schapter(chapter)
        if name is not None:
            self.set_sname(name)

    def load_item(self, chapter=None):
        """Fill the derived fields and return the loaded item.

        ``count`` and ``index`` are computed first, from the values currently
        loaded.  Only afterwards is ``chapter`` (when given) written to the
        ``chapter`` field, and ``s_name`` / ``s_chapter`` are saved.

        :param chapter: optional value to store in field ``chapter``.
        :returns: the result of the parent ``load_item``.
        """
        self.count()
        self.index()
        if chapter is not None:
            self.set_chapter(chapter)
        self.save_sname_schapter()
        return super().load_item()