Source code for docx_utils.flatten

# coding: utf-8
u"""
Docx to flat XML converter
==========================

This converter is inspired from Eric White’s article: `Transforming Open XML Documents to Flat OPC Format
<https://blogs.msdn.microsoft.com/ericwhite/2008/09/29/transforming-open-xml-documents-to-flat-opc-format/>`_.

    This post describes the process of conversion of an Open XML (OPC) document
    into a Flat OPC document, and presents the C# function, OpcToFlat.

The function :func:`~docx_utils.flatten.opc_to_flat_opc` is used to convert
an Open XML document (.docx, .xlsx, .pptx) into a flat OPC format (.xml).
"""
import base64
import collections
import io
import itertools
import mimetypes
import os
import zipfile

import six.moves.urllib.request
from lxml import etree

from docx_utils.exceptions import UnknownContentTypeError


[docs]class ContentTypes(object): """ ContentTypes contained in a "[Content_Types].xml" file. """ NS = {"ct": u"http://schemas.openxmlformats.org/package/2006/content-types"} def __init__(self): self._defaults = {} self._overrides = {}
[docs] def parse_xml_data(self, data): tree = etree.fromstring(data) # type: etree._Element self._defaults = { n.attrib[u"Extension"]: n.attrib[u"ContentType"] for n in tree.xpath(u"//ct:Default", namespaces=self.NS) } self._overrides = { n.attrib[u"PartName"]: n.attrib[u"ContentType"] for n in tree.xpath(u"//ct:Override", namespaces=self.NS) }
[docs] def resolve(self, part_name): basename = os.path.basename(part_name) ext = basename.rsplit(".", 1)[1] content_type = self._overrides.get(part_name) or self._defaults.get(ext) return content_type or mimetypes.guess_type(part_name, strict=True)[0]
PackagePart = collections.namedtuple("PackagePart", ["uri", "content_type", "data"])
[docs]def iter_package(opc_path, on_error="ignore"): """ Iterate a Open XML document and yield the package parts. :param str opc_path: Microsoft Office document to read (.docx, .xlsx, .pptx) :param str on_error: control the way errors are handled when a part URI cannot be resolved: - 'ignore": ignore the part, - 'strict': raise an exception. :return: Iterator which yield package parts :raise UnknownContentTypeError: if a part URI cannot be resolved. """ content_types = ContentTypes() with zipfile.ZipFile(opc_path, mode="r") as f: for name in f.namelist(): if name == "[Content_Types].xml": content_types.parse_xml_data(f.read(name)) else: uri = six.moves.urllib.request.pathname2url(name) uri = uri.decode("utf-8") if isinstance(uri, six.binary_type) else uri content_type = content_types.resolve(uri) if content_type is None: if on_error == "strict": raise UnknownContentTypeError(opc_path, uri) elif on_error == "ignore": pass else: raise ValueError(on_error) else: data = f.read(name) yield PackagePart(uri, content_type, data)
[docs]def opc_to_flat_opc(src_path, dst_path, on_error="ignore"): """ Convert an Open XML document into a flat OPC format. :param str src_path: Microsoft Office document to convert (.docx, .xlsx, .pptx) :param str dst_path: Microsoft Office document converted into flat OPC format (.xml) :param str on_error: control the way errors are handled when a part URI cannot be resolved: - 'ignore": ignore the part, - 'strict': raise an exception. """ pkg = u"http://schemas.microsoft.com/office/2006/xmlPackage" ext = os.path.splitext(src_path)[1].lower() progid = { ".docx": u"Word.Document", ".xlsx": u"Excel.Sheet", ".pptx": u"PowerPoint.Show", }[ext] content = ( u'<?mso-application progid="{progid}"?>' u'<pkg:package xmlns:pkg="{pkg}"/>' ).format(progid=progid, pkg=pkg) document = etree.parse(io.StringIO(content)) # type: etree._ElementTree root = document.getroot() ns = {"pkg": pkg} for part in iter_package(src_path, on_error=on_error): node = etree.SubElement(root, u"{{{pkg}}}part".format(pkg=pkg), nsmap=ns) node.attrib[u"{{{pkg}}}name".format(pkg=pkg)] = part.uri node.attrib[u"{{{pkg}}}contentType".format(pkg=pkg)] = part.content_type if part.content_type.endswith("xml"): data = etree.SubElement(node, u"{{{pkg}}}xmlData".format(pkg=pkg), nsmap=ns) data.append(etree.fromstring(part.data)) else: node.attrib[u"{{{pkg}}}compression".format(pkg=pkg)] = "store" data = etree.SubElement( node, u"{{{pkg}}}binaryData".format(pkg=pkg), nsmap=ns ) encoded = base64.b64encode(part.data).decode() # bytes -> str iterable = iter(encoded) chunks = list(iter(lambda: list(itertools.islice(iterable, 76)), [])) chunks = u"\n".join(u"".join(chunk) for chunk in chunks) data.text = chunks content = etree.tostring( document, xml_declaration=True, encoding="UTF-8", pretty_print=False, with_tail=False, standalone=True, ) with io.open(dst_path, mode="wb") as f: f.write(content)