Source code for docx.xml_redactor

import zipfile
import os
import shutil
import tempfile
import atexit
import xml.etree.ElementTree as ET
from xml.etree.ElementTree import Element, ElementTree
from typing import Union, Tuple, Dict, List
from src.exceptions.docx_exceptions import *
from src.docx.enum.schemas import schemas



[docs]
class XMLRedactor:
    """
    For this class to work correctly, you must patch xml.etree.ElementTree:
    replace line 862 with: "return qnames, _namespace_map"
    (the exact line number may differ between Python versions).
    Otherwise the DOCX file will contain wrong namespaces after editing.
    """
    def __init__(self, path: str):
        if len(path) < 5 or len(path) > 255:
            raise NameError(
                f'\'{path}\' can\'t be open'
            )
        if path[-5:] != '.docx':
            raise NotSupportedFormat(
                path
            )
        self.path: str = path
        self._temp_dir: str = tempfile.mkdtemp()
        self.encoding: str = 'utf-8'
        atexit.register(self.__rm_temp_dir)
        self.__extract_files()

    def __rm_temp_dir(self):
        """
        Remove temp directory, when program is interrupted
        """
        if self._temp_dir:
            shutil.rmtree(self._temp_dir, ignore_errors=True)

    def __attrib(self, element: Element, key: str) -> str:
        try:
            return element.attrib[key]
        except KeyError as _ke:
            raise InvalidAttributeKey(element.tag, key)

    def __extract_files(self):
        """
        Extracting files in temp directory
        """
        with zipfile.ZipFile(self.path, 'r') as zr:
            zr.extractall(self._temp_dir)

    def __get_ilvl_marking(self) -> Dict[int, Tuple[int, int]]:
        """
        Getting default ilvl marking
        """
        return {
            0: (720, 360),
            1: (1440, 360),
            2: (2160, 360),
            3: (2880, 360),
            4: (3600, 360),
            5: (4320, 360),
            6: (5040, 360),
            7: (5760, 360),
            8: (6480, 360)
        }

    def __get_tree(self, path: str) -> ElementTree:
        try:
            return ET.parse(os.path.join(self._temp_dir, 'word', path))
        except FileNotFoundError as _fe:
            raise FileDoesNotContainXMLFile(self.path, path)


[docs]
    def delete_comment_by_id(self, comment_id: Union[int, str]):
        """
        Remove the markers of a comment from word/document.xml.

        In every paragraph, the direct children that are a commentRangeStart
        or commentRangeEnd with the given id, or a run containing a
        commentReference with that id, are removed. The result is written
        back to document.xml inside the temporary directory.

        :param comment_id: id of comment, that should be deleted
        :raises InvalidAttributeKey: if an inspected marker has no w:id attribute
        :raises FileDoesNotContainXMLFile: if word/document.xml is missing
        """
        for prefix, uri in schemas.to_namespace.items():
            ET.register_namespace(prefix, uri)
        target_id = str(comment_id)
        id_key = f'{{{schemas.w}}}id'
        range_tags = (
            f'{{{schemas.w}}}commentRangeStart',
            f'{{{schemas.w}}}commentRangeEnd',
        )
        run_tag = f'{{{schemas.w}}}r'
        reference_tag = f'{{{schemas.w}}}commentReference'

        tree = self.__get_tree('document.xml')
        root = tree.getroot()
        for para in root.iter(f'{{{schemas.w}}}p'):
            # Iterate over a snapshot: removing children while walking the
            # live element skips the sibling right after each removed node,
            # which left some markers of the comment behind.
            for element in list(para):
                if element.tag in range_tags:
                    if self.__attrib(element, id_key) == target_id:
                        para.remove(element)
                elif element.tag == run_tag:
                    # any() stops at the first match, so a run is removed at
                    # most once even if it holds several matching references.
                    if any(
                        sub_elem.tag == reference_tag
                        and self.__attrib(sub_elem, id_key) == target_id
                        for sub_elem in element
                    ):
                        para.remove(element)
        tree.write(os.path.join(self._temp_dir, 'word', 'document.xml'), xml_declaration=True,
                   encoding=self.encoding)



[docs]
    def edit_comment_by_id(self, comment_id: Union[int, str], comment_text: str, new_author: str = None):
        """
        Replace the text (and optionally the author) of a comment in
        word/comments.xml.

        The new text is stored in the first w:t found in the runs of the
        comment's first child; any further w:t in that child is emptied so
        the text is not repeated once per run. All other children of the
        comment are removed. The result is written back to comments.xml
        inside the temporary directory.

        :param comment_id: id of comment, that should be edited
        :param comment_text: new text of comment
        :param new_author: new author of comment; left unchanged when None
        :raises InvalidAttributeKey: if a comment element has no w:id attribute
        :raises FileDoesNotContainXMLFile: if word/comments.xml is missing
        """
        for prefix, uri in schemas.to_namespace.items():
            ET.register_namespace(prefix, uri)
        target_id = str(comment_id)
        id_key = f'{{{schemas.w}}}id'
        author_key = f'{{{schemas.w}}}author'
        run_tag = f'{{{schemas.w}}}r'
        text_tag = f'{{{schemas.w}}}t'

        tree = self.__get_tree('comments.xml')
        root = tree.getroot()
        first_paragraph_seen = False
        text_written = False
        # (comment, child) pairs collected during the scan and removed after
        # it, so no element is mutated while it is being iterated.
        extra_children = []
        for comment in root:
            if self.__attrib(comment, id_key) != target_id:
                continue
            if new_author is not None:
                comment.set(author_key, new_author)
            for paragraph in comment:
                if first_paragraph_seen:
                    extra_children.append((comment, paragraph))
                    continue
                first_paragraph_seen = True
                for run in paragraph.findall(run_tag):
                    for text_node in run.findall(text_tag):
                        # Only the first text node carries the new text;
                        # writing it into every node duplicated the text.
                        text_node.text = '' if text_written else comment_text
                        text_written = True

        for comment, child in extra_children:
            comment.remove(child)

        tree.write(os.path.join(self._temp_dir, 'word', 'comments.xml'), xml_declaration=True,
                   encoding=self.encoding)




[docs]
    def save(self, path: str = None):
        """
        Save changes to files
        :param path: path to new files
        """
        if path is None:
            path = self.path
        with zipfile.ZipFile(path, 'w') as zw:
            for folder, sub_folder, file_names in os.walk(self._temp_dir):
                for file_name in file_names:
                    file_path = os.path.join(folder, file_name)
                    zw.write(file_path, os.path.relpath(file_path, self._temp_dir))



[docs]
    def add_new_abstract_and_num(self, list_style: bool = True, levels_count: int = 1,
                                 bullet_symbol: str = '•',
                                 custom_ilvl_marking: Dict[int, Tuple[int, int]] = None) -> int:
        """
        Append a new abstractNum and a num that points to it in
        word/numbering.xml, then write the file back to the temporary directory.

        :param list_style: True for a decimal list, False for a bullet list
        :param levels_count: number of levels to create; must be within 0..8
        :param bullet_symbol: symbol used as the level text of bullet lists
        :param custom_ilvl_marking: per-level ``(left, hanging)`` indentation keyed
                                    by level index; the default table is used when None
        :return: numId of the new num element
        :raises ILvlDoesNotExist: if ``levels_count`` is outside 0..8
        :raises IncorrectILvlMarkingDict: if the marking has no entry for a level
        """
        for prefix, uri in schemas.to_namespace.items():
            ET.register_namespace(prefix, uri)
        tree = self.__get_tree('numbering.xml')
        root = tree.getroot()
        # NOTE(review): the default marking table has nine levels (0..8) but
        # at most eight can be requested here - confirm this is intended.
        if not (0 <= levels_count <= 8):
            raise ILvlDoesNotExist(levels_count)
        if custom_ilvl_marking is None:
            marking = self.__get_ilvl_marking()
        else:
            marking = custom_ilvl_marking

        # New ids are one past the highest existing ones (floors: -1 and 0).
        abstract_ids = [
            int(node.attrib[f'{{{schemas.w}}}abstractNumId'])
            for node in root.iter(f'{{{schemas.w}}}abstractNum')
        ]
        new_abstract_id = max([-1] + abstract_ids) + 1
        num_ids = [
            int(node.attrib[f'{{{schemas.w}}}numId'])
            for node in root.iter(f'{{{schemas.w}}}num')
        ]
        new_num_id = max([0] + num_ids) + 1

        abstract = ET.Element(f'{{{schemas.w}}}abstractNum', {
            f'{{{schemas.w}}}abstractNumId': str(new_abstract_id),
            f'{{{schemas.w15}}}restartNumberingAfterBreak': '0'
        })
        ET.SubElement(abstract, f'{{{schemas.w}}}multiLevelType', {
            f'{{{schemas.w}}}val': 'hybridMultilevel'
        })
        for level in range(levels_count):
            try:
                level_marking = marking[level]
            except KeyError:
                raise IncorrectILvlMarkingDict(level)
            abstract.append(self.__create_ilvl(list_style, level, bullet_symbol, level_marking))

        num = ET.Element(f'{{{schemas.w}}}num', {
            f'{{{schemas.w}}}numId': str(new_num_id),
            f'{{{schemas.w16cid}}}durableId': '0'
        })
        ET.SubElement(num, f'{{{schemas.w}}}abstractNumId', {
            f'{{{schemas.w}}}val': str(new_abstract_id)
        })

        # The abstractNum is inserted at the index equal to the number of
        # existing abstractNum elements; the num goes to the end of the root.
        existing_abstract_count = len(root.findall(f'.//{{{schemas.w}}}abstractNum'))
        root.insert(existing_abstract_count, abstract)
        root.append(num)

        target = os.path.join(self._temp_dir, 'word', 'numbering.xml')
        tree.write(target, xml_declaration=True, encoding=self.encoding)
        return new_num_id


    def __create_ilvl(self, is_decimal: bool, i_lvl: int = 0, bullet_symbol: str = '•',
                      ilvl_marking: Tuple[int, int] = None) -> Element:
        """
        Build the w:lvl element of one list level for a new abstractNum.

        The element contains start (1), numFmt, lvlText, lvlJc (left),
        pPr/ind with the given indentation and rPr/rFonts with the default hint.

        :param is_decimal: True for a decimal level, False for a bullet level
        :param i_lvl: zero-based level index
        :param bullet_symbol: level text used when the level is a bullet
        :param ilvl_marking: ``(left, hanging)`` values written to w:ind; must be
                             supplied, the None default is not handled and fails
                             on indexing
        :return: the new w:lvl element
        """
        w = schemas.w
        # "%N" in lvlText refers to the counter of level N (1-based), so a
        # decimal level must reference its own counter; a fixed "%1." made
        # nested levels display the top-level number.
        if is_decimal:
            level_text = f'%{int(i_lvl) + 1}.'
        else:
            level_text = bullet_symbol

        # A separate local name keeps the i_lvl parameter from being shadowed.
        level = ET.Element(f'{{{w}}}lvl', {f'{{{w}}}ilvl': str(i_lvl)})
        ET.SubElement(level, f'{{{w}}}start', {f'{{{w}}}val': '1'})
        ET.SubElement(level, f'{{{w}}}numFmt',
                      {f'{{{w}}}val': 'decimal' if is_decimal else 'bullet'})
        ET.SubElement(level, f'{{{w}}}lvlText', {f'{{{w}}}val': level_text})
        ET.SubElement(level, f'{{{w}}}lvlJc', {f'{{{w}}}val': 'left'})

        paragraph_props = ET.SubElement(level, f'{{{w}}}pPr')
        ET.SubElement(paragraph_props, f'{{{w}}}ind', {
            f'{{{w}}}left': str(ilvl_marking[0]),
            f'{{{w}}}hanging': str(ilvl_marking[1]),
        })

        run_props = ET.SubElement(level, f'{{{w}}}rPr')
        ET.SubElement(run_props, f'{{{w}}}rFonts', {f'{{{w}}}hint': 'default'})
        return level


[docs]
    def edit_list_style_by_paraIds(self, paraIds: Union[List[str], str],
                                   new_num: Union[str, int]):
        """
        Set the numId of every listed paragraph in word/document.xml to ``new_num``.

        Every w:numId found inside a matching paragraph gets its w:val
        replaced. The document is written back to the temporary directory
        only if all matching paragraphs were updated.

        :param paraIds: one paraId or a list of paraIds of the paragraphs to change
        :param new_num: new numId value
        :return: None
        :raises InvalidAttributeKey: if any paragraph has no w14:paraId attribute
        :raises ParagraphDoesNotContainNumPr: if a matching paragraph has no w:numId
        :raises FileDoesNotContainXMLFile: if word/document.xml is missing
        """
        # A set gives constant-time membership tests for every paragraph.
        targets = {paraIds} if isinstance(paraIds, str) else set(paraIds)
        for prefix, uri in schemas.to_namespace.items():
            ET.register_namespace(prefix, uri)
        para_id_key = f'{{{schemas.w14}}}paraId'
        num_id_tag = f'{{{schemas.w}}}numId'
        val_key = f'{{{schemas.w}}}val'
        new_value = str(new_num)

        tree = self.__get_tree('document.xml')
        root = tree.getroot()
        for paragraph in root.iter(f'{{{schemas.w}}}p'):
            para_id = self.__attrib(paragraph, para_id_key)
            if para_id not in targets:
                continue
            num_ids = list(paragraph.iter(num_id_tag))
            if not num_ids:
                raise ParagraphDoesNotContainNumPr(para_id)
            for num in num_ids:
                num.set(val_key, new_value)
        # Use the instance encoding like every other writer in this class,
        # instead of a hard-coded 'utf-8'.
        tree.write(os.path.join(self._temp_dir, 'word', 'document.xml'), xml_declaration=True,
                   encoding=self.encoding)



[docs]
    def get_all_para_attributes(self) -> List[Dict[str, str]]:
        """
        Collect the attribute dictionaries of all w:p elements in
        word/document.xml, in document order.

        :return: one attribute dictionary per paragraph
        :raises FileDoesNotContainXMLFile: if word/document.xml is missing
        """
        document_root = self.__get_tree('document.xml').getroot()
        paragraph_tag = f'{{{schemas.w}}}p'
        return [node.attrib for node in document_root.iter(paragraph_tag)]
Source code for docx.xml_redactor

Edulytica-PythonDM

Navigation

Related Topics