ComicScrapy/Comics/exporters.py
2024-02-20 21:08:13 +08:00

130 lines
5.2 KiB
Python

import os.path,json,ast
from Comics.settings import COMIC_INFO_FIELDS_TO_EXPORT
from scrapy.exporters import XmlItemExporter
from scrapy.exporters import PythonItemExporter
from scrapy.exporters import JsonItemExporter
from Comics.items import ComicInfoItem
from Comics.items import ComicItem
from Comics.settings import COMIC_INFO_XML_STORE
from Comics.utils import ComicPath
from scrapy.utils.python import is_listlike, to_bytes, to_unicode
from itemadapter import ItemAdapter
class CommonExporter():
    """Helper shared by the exporters for preparing output file paths."""

    def getPath(self, file, sufix=None):
        """Return *file* with the requested extension, creating its directory.

        Args:
            file: Target file path (absolute or relative).
            sufix: Extension without the leading dot (for example ``"json"``).
                When ``None`` the path is returned unchanged.

        Returns:
            The path, with ``"." + sufix`` appended unless it already ends
            with that extension.
        """
        dirname = os.path.dirname(file)
        # A bare file name has no directory component; os.makedirs("") would
        # raise, so only create the directory when there is one.
        if dirname:
            # exist_ok avoids the race between an existence check and creation.
            os.makedirs(dirname, exist_ok=True)
        if sufix is None:
            return file
        extension = "." + sufix
        # Compare against the end of the path: a substring test would skip the
        # extension whenever a parent directory happens to contain it.
        if not file.endswith(extension):
            file = file + extension
        return file
class ItemExporter(PythonItemExporter):
    """Exporter that turns an item into plain Python data with bytes decoded."""

    def convert(self, data):
        """Recursively decode ``bytes`` to ``str`` inside nested containers.

        Args:
            data: A value that may be ``bytes``, a ``dict``, a ``tuple``,
                a ``list`` or any other object.

        Returns:
            ``bytes`` decoded as UTF-8; dicts, tuples and lists rebuilt with
            their contents converted; any other value returned unchanged.
        """
        if isinstance(data, bytes):
            return data.decode("utf-8")
        if isinstance(data, dict):
            return {
                self.convert(key): self.convert(value)
                for key, value in data.items()
            }
        if isinstance(data, tuple):
            # Build a real tuple: a bare map() is a one-shot iterator in
            # Python 3 and cannot be re-read or serialised by consumers.
            return tuple(self.convert(element) for element in data)
        if isinstance(data, list):
            return [self.convert(element) for element in data]
        return data

    def export_obj(self, obj_item):
        """Export *obj_item* through the base exporter and return the result.

        Args:
            obj_item: The item to export.

        Returns:
            The value produced by ``export_item`` after passing it through
            :meth:`convert`.
        """
        self.start_exporting()
        exported = self.convert(self.export_item(obj_item))
        self.finish_exporting()
        return exported
class JsonExport(JsonItemExporter):
    """JSON exporter that opens its own output file from a path."""

    def __init__(self, file, **kwargs):
        """Open the target file for writing and initialise the exporter.

        Args:
            file: Destination path; ``CommonExporter.getPath`` is asked to
                apply the ``json`` suffix to it.
            **kwargs: Forwarded to ``JsonItemExporter.__init__``.
        """
        file = CommonExporter().getPath(file=file, sufix="json")
        self.file = open(file, "wb")
        try:
            super().__init__(self.file, **kwargs)
        except Exception:
            # Do not leak the handle if the base exporter rejects the options.
            self.file.close()
            raise

    def export_json(self, json_object, if_return=False):
        """Write *json_object* to the file and close it.

        Args:
            json_object: The item to export.
            if_return: When true, also return the item converted by
                ``ItemExporter().export_obj``.

        Returns:
            The converted item when *if_return* is true, otherwise ``None``.
        """
        try:
            self.start_exporting()
            self.export_item(json_object)
            self.finish_exporting()
        finally:
            # Always release the file handle, even if exporting fails.
            self.file.close()
        if if_return:
            return ItemExporter().export_obj(json_object)
        return None
class ComicInfoXmlItemExporter(XmlItemExporter):
    """XML exporter that writes a comic item as a ``ComicInfo.xml`` file."""

    # Name of the XML root element; also used as the output file's base name.
    custom_root_element = "ComicInfo"

    def __init__(self, dir):
        """Open ``<COMIC_INFO_XML_STORE>/<dir>/ComicInfo.xml`` for writing.

        Args:
            dir: Sub-directory (relative to ``COMIC_INFO_XML_STORE``) that
                will contain the XML file; it is created when missing.
        """
        file_path = os.path.join(COMIC_INFO_XML_STORE, dir,
                                 f"{self.custom_root_element}.xml")
        dir_path = os.path.dirname(file_path)
        if dir_path:
            # exist_ok avoids the race between an existence check and creation.
            os.makedirs(dir_path, exist_ok=True)
        self.xml_file = open(file_path, "wb")
        super().__init__(self.xml_file,
                         root_element=self.custom_root_element,
                         indent=1,
                         fields_to_export=COMIC_INFO_FIELDS_TO_EXPORT)

    def serialize_field(self, field, name, value):
        """Serialize a field after passing it through ``chinese_convert``.

        NOTE(review): ``ComicPath.chinese_convert`` is defined outside this
        file; presumably it normalises Chinese text -- confirm in Comics.utils.
        """
        value = ComicPath.chinese_convert(value)
        return super().serialize_field(field, name, value)

    def start_exporting(self):
        """Begin the XML document and open the root element."""
        self.xg.startDocument()
        self.xg.startElement(self.custom_root_element, {})

    def comic_to_info_item(self, comic_item):
        """Build the ComicInfo data for *comic_item*.

        Every ``ComicInfoItem`` field whose metadata has an ``info`` entry is
        filled from the ``ComicItem`` key named by that entry.

        Args:
            comic_item: Data accepted by the ``ComicItem`` constructor.

        Returns:
            The result of ``ItemExporter().export_obj`` for the populated
            ``ComicInfoItem``.
        """
        info_adapter = ItemAdapter(ComicInfoItem())
        # Maps a ComicItem key to the ComicInfoItem field it feeds.
        source_to_info = {}
        for field in info_adapter.field_names():
            source_key = info_adapter.get_field_meta(field).get('info')
            if source_key is not None:
                source_to_info[source_key] = field
        comic_info = {}
        for key, value in ComicItem(comic_item).items():
            info_field = source_to_info.get(key)
            if info_field is not None:
                comic_info[info_field] = value
        return ItemExporter().export_obj(ComicInfoItem(comic_info))

    def export_item(self, item):
        """Write the ComicInfo fields of *item* as children of the root element.

        Fields whose serialized value is ``None`` or an empty string are
        omitted. The ``Pages`` field is parsed from its string form with
        ``ast.literal_eval`` and its entries are written as ``Page`` elements.

        Args:
            item: The comic item to export.

        Returns:
            The ComicInfo data produced by :meth:`comic_to_info_item`.
        """
        comic_info = self.comic_to_info_item(item)
        child_element = "Page"
        self._beautify_indent(depth=1)
        self._beautify_newline()
        for name, value in self._get_serialized_fields(comic_info, default_value=""):
            # Skip fields without content. This also keeps the empty default
            # of a missing "Pages" field away from literal_eval, which would
            # raise SyntaxError on an empty string.
            if value is None or value == "":
                continue
            if name == "Pages" and isinstance(value, str):
                value = ast.literal_eval(value)
            self._export_xml_field(name, value, depth=2, child_element=child_element)
        return comic_info

    def _export_xml_field(self, name, serialized_value, depth, child_element="value"):
        """Write one element, recursing into mappings and list-like values.

        Args:
            name: Tag name of the element.
            serialized_value: Mapping, list-like value, string or any other
                object (written via ``str``).
            depth: Indentation depth of the element.
            child_element: Tag name used for the entries of a list-like
                value at this level; nested levels fall back to ``"value"``.
        """
        self._beautify_indent(depth=depth)
        self.xg.startElement(name, {})
        if hasattr(serialized_value, "items"):
            self._beautify_newline()
            for sub_name, value in serialized_value.items():
                self._export_xml_field(sub_name, value, depth=depth + 1)
            self._beautify_indent(depth=depth)
        elif is_listlike(serialized_value):
            self._beautify_newline()
            for value in serialized_value:
                self._export_xml_field(child_element, value, depth=depth + 1)
            self._beautify_indent(depth=depth)
        elif isinstance(serialized_value, str):
            self.xg.characters(serialized_value)
        else:
            self.xg.characters(str(serialized_value))
        self.xg.endElement(name)
        self._beautify_newline()

    def finish_exporting(self):
        """Close the root element, end the document and close the file."""
        self.xg.endElement(self.custom_root_element)
        self.xg.endDocument()
        self.xml_file.close()

    def export_xml(self, item):
        """Export *item* as a complete XML document.

        Args:
            item: The comic item to export.

        Returns:
            The ComicInfo data returned by :meth:`export_item`.
        """
        try:
            self.start_exporting()
            comic_info = self.export_item(item)
            self.finish_exporting()
        finally:
            # finish_exporting closes the file on success; make sure the
            # handle is also released when exporting fails part-way.
            if not self.xml_file.closed:
                self.xml_file.close()
        return comic_info