Source code for ebooklib.plugins.standard

# This file is part of EbookLib.
# Copyright (c) 2013 Aleksandar Erkalovic <aerkalov@gmail.com>
#
# EbookLib is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# EbookLib is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with EbookLib.  If not, see <http://www.gnu.org/licenses/>.

import six

from ebooklib.plugins.base import BasePlugin
from ebooklib.utils import parse_html_string

# TODO:
#   - should also look for the _required_ elements
# http://www.w3.org/html/wg/drafts/html/master/tabular-data.html#the-table-element

# Attribute names that are kept on every element, whatever its tag.
# SyntaxPlugin.html_before_write concatenates this list with the
# element-specific names before calling leave_only(); elements without an
# element-specific list keep only the names below.
ATTRIBUTES_GLOBAL = [
    "accesskey",
    "class",
    "contenteditable",
    "contextmenu",
    "dir",
    "draggable",
    "dropzone",
    "hidden",
    "id",
    "inert",
    "itemid",
    "itemprop",
    "itemref",
    "itemscope",
    "itemtype",
    "lang",
    "spellcheck",
    "style",
    "tabindex",
    "title",
    "translate",
    "epub:type",
]

# Tag names that SyntaxPlugin.html_before_write passes, one at a time, to
# lxml's etree.strip_tags() before any attribute filtering happens.
# <u> has been taken out of this list for now.
DEPRECATED_TAGS = [
    "acronym",
    "applet",
    "basefont",
    "big",
    "center",
    "dir",
    "font",
    "frame",
    "frameset",
    "isindex",
    "noframes",
    "s",
    "strike",
    "tt",
]


def leave_only(item, tag_list):
    """Delete every attribute of ``item`` whose name is not in ``tag_list``.

    ``item`` is any element-like object exposing a mutable ``attrib``
    mapping; it is modified in place and nothing is returned.
    ``tag_list`` is a container of the attribute names to keep (despite the
    parameter name, these are attribute names, not tag names).
    """
    # Take a snapshot of the names before deleting: removing entries while
    # walking a live key iterator raises RuntimeError on a plain dict and
    # otherwise depends on how the mapping implements keys().
    for _attr in list(item.attrib.keys()):
        if _attr not in tag_list:
            del item.attrib[_attr]
class SyntaxPlugin(BasePlugin):
    """Plugin that cleans up a chapter's HTML just before it is written.

    The clean-up unwraps the tags listed in ``DEPRECATED_TAGS``, strips
    attributes that are not whitelisted for the element they appear on, and
    records the ``svg`` / ``remote-resources`` properties on the chapter
    when the markup calls for them.
    """

    NAME = "Check HTML syntax"

    # Element-specific attribute names allowed on children of <head>, on top
    # of ATTRIBUTES_GLOBAL. <title> and <meta> are handled separately.
    _HEAD_ATTRIBUTES = {
        "base": ("href", "target"),
        "link": ("href", "crossorigin", "rel", "media", "hreflang", "type", "sizes"),
        "script": ("src", "type", "charset", "async", "defer", "crossorigin"),
        "source": ("src", "type", "media"),
        "style": ("media", "type", "scoped"),
    }

    # Element-specific attribute names allowed inside <body>, on top of
    # ATTRIBUTES_GLOBAL. Tags missing from this table keep only the global
    # attributes. <dl> and <svg> never reach this table (see _clean_body).
    _BODY_ATTRIBUTES = {
        "a": ("href", "target", "download", "rel", "hreflang", "type"),
        "area": ("alt", "coords", "shape", "href", "target", "download", "rel", "hreflang", "type"),
        "audio": ("src", "crossorigin", "preload", "autoplay", "mediagroup", "loop", "muted", "controls"),
        "blockquote": ("cite",),
        "button": (
            "autofocus",
            "disabled",
            "form",
            "formaction",
            "formenctype",
            "formmethod",
            "formnovalidate",
            "formtarget",
            "name",
            "type",
            "value",
            "menu",
        ),
        "canvas": ("width", "height"),
        "col": ("span",),
        "colgroup": ("span",),
        "del": ("cite", "datetime"),
        "details": ("open",),
        "embed": ("src", "type", "width", "height"),
        # The original whitelist said "disable", which is not an attribute.
        "fieldset": ("disabled", "form", "name"),
        # This list used to sit behind a second, unreachable "details" test.
        "form": (
            "accept-charset",
            "action",
            "autocomplete",
            "enctype",
            "method",
            "name",
            "novalidate",
            "target",
        ),
        "iframe": ("src", "srcdoc", "name", "sandbox", "seamless", "allowfullscreen", "width", "height"),
        "img": ("alt", "src", "crossorigin", "usemap", "ismap", "width", "height"),
        "input": (
            "accept",
            "alt",
            "autocomplete",
            "autofocus",
            "checked",
            "dirname",
            "disabled",
            "form",
            "formaction",
            "formenctype",
            "formmethod",
            "formnovalidate",
            "formtarget",
            "height",
            "inputmode",
            "list",
            "max",
            "maxlength",
            "min",
            "multiple",
            "name",
            "pattern",
            "placeholder",
            "readonly",
            "required",
            "size",
            "src",
            # These two were fused into the single bogus name "steptype".
            "step",
            "type",
            "value",
            "width",
        ),
        "ins": ("cite", "datetime"),
        "keygen": ("autofocus", "challenge", "disabled", "form", "keytype", "name"),
        "label": ("form", "for"),
        "map": ("name",),
        "menu": ("type", "label"),
        "object": ("data", "type", "typemustmatch", "name", "usemap", "form", "width", "height"),
        "ol": ("reversed", "start", "type"),
        "optgroup": ("disabled", "label"),
        "option": ("disabled", "label", "selected", "value"),
        "output": ("for", "form", "name"),
        "param": ("name", "value"),
        "progress": ("value", "max"),
        "q": ("cite",),
        "select": ("autofocus", "disabled", "form", "multiple", "name", "required", "size"),
        "table": ("border", "sortable"),
        "td": ("colspan", "rowspan", "headers"),
        "textarea": (
            "autocomplete",
            "autofocus",
            "cols",
            "dirname",
            "disabled",
            "form",
            "inputmode",
            "maxlength",
            "name",
            "placeholder",
            "readonly",
            "required",
            "rows",
            "wrap",
        ),
        "th": ("colspan", "rowspan", "headers", "scope", "abbr", "sorted"),
        "time": ("datetime",),
        "track": ("kind", "src", "srclang", "label", "default"),
        "video": (
            "src",
            "crossorigin",
            "poster",
            "preload",
            "autoplay",
            "mediagroup",
            "loop",
            "muted",
            "controls",
            "width",
            "height",
        ),
    }

    def html_before_write(self, book, chapter):
        """Sanitise ``chapter.content`` in place and return the new content.

        ``book`` receives an extra image item when a remote ``<img>`` is
        found; ``chapter`` has its ``content`` replaced and may get entries
        appended to ``properties``.

        Returns the serialised document produced by ``etree.tostring``, or
        ``None`` (leaving the chapter untouched) when the content cannot be
        parsed.
        """
        from lxml import etree

        try:
            tree = parse_html_string(chapter.content)
        except Exception:
            # Deliberately best effort: unparsable content is written as is.
            return

        root = tree.getroottree()

        # delete deprecated tags
        # i should really have a list of allowed tags
        for tag in DEPRECATED_TAGS:
            etree.strip_tags(root, tag)

        head = tree.find("head")
        if head is not None and len(head) != 0:
            self._clean_head(head)

        # find() returns None when there is no <body>; check before len().
        body = tree.find("body")
        if body is not None and len(body) != 0:
            self._clean_body(book, chapter, body)

        chapter.content = etree.tostring(tree, pretty_print=True, encoding="utf-8", xml_declaration=True)

        return chapter.content

    def _clean_head(self, head):
        """Filter the direct children of ``<head>``."""
        # Work on a snapshot because children are removed inside the loop.
        for item in list(head):
            tag = item.tag

            if tag == "title":
                # An element without text reports None rather than "", so
                # test for falsiness to catch the empty title.
                if not item.text:
                    head.remove(item)
            elif tag == "meta":
                # just remove for now, but really should not be like this
                head.remove(item)
            else:
                leave_only(item, ATTRIBUTES_GLOBAL + list(self._HEAD_ATTRIBUTES.get(tag, ())))

    def _clean_body(self, book, chapter, body):
        """Filter ``<body>`` and every element below it."""
        for item in body.iter():
            tag = item.tag

            if tag == "dl":
                # Left untouched for now. It should really be checked
                # against http://html5doctor.com/the-dl-element/
                # (dl > dt, dd, dt, dd ... where some can be missing).
                continue

            if tag == "svg":
                # Only the two attributes handled in _clean_svg are dropped
                # from the <svg> element itself.
                self._clean_svg(chapter, item)
                continue

            if tag == "img":
                self._flag_remote_image(book, chapter, item)
            elif tag == "table":
                self._normalise_table(item)

            leave_only(item, ATTRIBUTES_GLOBAL + list(self._BODY_ATTRIBUTES.get(tag, ())))

    def _flag_remote_image(self, book, chapter, item):
        """Mark the chapter as using remote resources for an http(s) image."""
        src = item.get("src", "")

        if not src.lower().startswith(("http://", "https://")):
            return

        if "remote-resources" in chapter.properties:
            return

        chapter.properties.append("remote-resources")

        # THIS DOES NOT WORK, ONLY VIDEO AND AUDIO FILES CAN BE REMOTE RESOURCES
        # THAT MEANS I SHOULD ALSO CATCH <SOURCE TAG
        from ebooklib import epub

        book.add_item(epub.EpubImage(file_name=src))

    def _normalise_table(self, table):
        """Blank a ``border="0"`` and turn ``summary`` into a ``<caption>``."""
        from lxml import etree

        if table.get("border") == "0":
            table.set("border", "")

        summary = table.get("summary")
        if summary:
            # add it as caption
            caption = etree.Element("caption", {})
            caption.text = summary
            table.insert(0, caption)
            del table.attrib["summary"]

    def _clean_svg(self, chapter, svg):
        """Record the ``svg`` property and drop two lower-cased attributes."""
        # We need to add property "svg" in case we have embedded svg file
        if "svg" not in chapter.properties:
            chapter.properties.append("svg")

        for name in ("viewbox", "preserveaspectratio"):
            if svg.get(name, None):
                del svg.attrib[name]