Python - working with xml/lxml/objectify/schemas, datatypes, and assignments

aapost aapost at idontexist.club
Sun Jan 15 20:13:35 EST 2023


On 1/3/23 22:57, aapost wrote:
> I am trying to wrap my head around how one goes about working with and 
> editing xml elements ... Back to 
> contemplating and tinkering..

For anyone in a similar situation, xmlschema is actually quite nice.
It didn't have the features I was looking for out of the box, but it 
does have a to_objects function and I have learned quite a bit while 
picking it apart. I am able to patch it to be good enough for my 
requirements.
Below is the patch for anyone interested:
#
# Contribution for the xmlschema & elementpath python modules which are
# Copyright (c), 2016-2020, SISSA (International School for Advanced Studies).
# All rights reserved.
#
# This file is distributed under the terms of the MIT License.
# See the file 'LICENSE' in the root directory of the present
# distribution, or http://opensource.org/licenses/MIT.
#
# Patching and expansion of the xmlschema.dataobjects.DataElement object features.
# To get the best demonstration, change the schema variable to your .xsd,
# and xmlobj to your .xml file,
# then run this as $ python -i filename.py
from typing import Any, Optional, Union, Tuple
#from types import MethodType
class ValueLockedError(Exception):
    """Raised when a change is attempted on an object whose ``_locked`` flag is set.

    The formatted text is stored on ``message`` and is also passed to
    ``Exception`` so that ``str(error)`` yields the same text.

    :param obj: the object that refused the change (rendered with ``str``).
    :param variable_name: name of the member that could not be set.
    """

    def __init__(self, obj, variable_name):
        self.message = (
            f"Can't set .{variable_name}"
            f"\nThe object:\n{obj!s}"
            "\nis Locked (._locked is set to True)"
        )
        super().__init__(self.message)
# importing in order necessary for intended monkey patch
import elementpath.etree as ep_etree
# Monkey patching additional functions onto the imported elementpath.etree
# module, for namespace management of xml.etree.ElementTree code paths (which
# use the global variable register_namespace._namespace_map for namespace
# registering).
def etree_remove_registered_namespace(elem: ep_etree.ElementProtocol,
                                      uri: str = '') -> None:
    """Remove the registration of ``uri`` for the ElementTree flavour of ``elem``.

    For elements without an ``nsmap`` attribute the entry keyed by ``uri`` is
    dropped from ``register_namespace._namespace_map`` of the matching
    ElementTree module. For elements that expose ``nsmap``, every prefix
    mapped to ``uri`` is deleted from that mapping instead.

    :param elem: an ElementTree-like element; only used to pick the code path.
    :param uri: the namespace URI to remove.
    :raises TypeError: if ``elem`` is not an ElementTree element.
    """
    if not ep_etree.is_etree_element(elem):
        raise TypeError(f"{elem!r} is not an Element")

    if hasattr(elem, 'nsmap'):
        # TODO research this for better understanding
        # _namespace_map is uri->prefix
        # DataElement.nsmap prefix->uri
        # lxml etree .nsmap ?->?
        # not using lxml anyway so not really an issue as
        # this condition shouldn't be met
        # NOTE(review): lxml's nsmap may be a copy, in which case the
        # deletion below has no lasting effect -- confirm.
        nsmap = elem.nsmap
        # research - can there be multiple instances of uri to prefix?..
        # or are they intended to be 1:1?..
        # Collect the prefixes first: deleting while iterating over the
        # mapping would raise "dictionary changed size during iteration".
        for prefix in [p for p, ns in nsmap.items() if ns == uri]:
            del nsmap[prefix]
        return

    etree_module: Any
    if isinstance(elem, ep_etree.PyElementTree.Element):
        etree_module = ep_etree.PyElementTree
    else:
        etree_module = ep_etree.ElementTree
    etree_module.register_namespace._namespace_map.pop(uri, None)


# patching
# A plain function is stored: module attributes are never bound, and a
# staticmethod object fetched from a module is not callable before Python 3.10.
setattr(ep_etree, "etree_remove_registered_namespace",
        etree_remove_registered_namespace)
# for namespace management of xml.etree.ElementTree code paths (which use
# the global variable register_namespace._namespace_map for namespace
# registering)
def etree_get_registered_namespaces(elem: ep_etree.ElementProtocol) -> dict:
    """Return the namespace registry that applies to ``elem``.

    For elements without an ``nsmap`` attribute this is the
    ``register_namespace._namespace_map`` dict of the matching ElementTree
    module (the dict object itself, not a copy). For elements exposing
    ``nsmap`` that attribute is returned.

    :param elem: an ElementTree-like element; only used to pick the code path.
    :raises TypeError: if ``elem`` is not an ElementTree element.
    """
    if not ep_etree.is_etree_element(elem):
        raise TypeError(f"{elem!r} is not an Element")

    if hasattr(elem, 'nsmap'):
        return elem.nsmap  # shouldn't be met

    etree_module: Any
    if isinstance(elem, ep_etree.PyElementTree.Element):
        etree_module = ep_etree.PyElementTree
    else:
        etree_module = ep_etree.ElementTree
    return etree_module.register_namespace._namespace_map


# patching
# A plain function is stored: module attributes are never bound, and a
# staticmethod object fetched from a module is not callable before Python 3.10.
setattr(ep_etree, "etree_get_registered_namespaces",
        etree_get_registered_namespaces)
# for namespace management of xml.etree.ElementTree code paths (which use
# the global variable register_namespace._namespace_map for namespace
# registering)
def etree_register_namespace(elem: ep_etree.ElementProtocol,
                             prefix: Optional[str] = None,
                             uri: Optional[str] = None) -> None:
    """Register ``prefix`` for ``uri`` with the ElementTree flavour of ``elem``.

    Nothing is registered unless both ``prefix`` and ``uri`` are given. For
    elements without an ``nsmap`` attribute the matching ElementTree module's
    ``register_namespace`` is called; otherwise ``elem.nsmap[prefix]`` is set.

    :param elem: an ElementTree-like element; only used to pick the code path.
    :param prefix: the namespace prefix to register.
    :param uri: the namespace URI to register.
    :raises TypeError: if ``elem`` is not an ElementTree element.
    """
    if not ep_etree.is_etree_element(elem):
        raise TypeError(f"{elem!r} is not an Element")
    if prefix is None or uri is None:
        return

    if hasattr(elem, 'nsmap'):
        # TODO research this for better understanding
        # _namespace_map is uri->prefix
        # DataElement.nsmap prefix->uri
        # lxml etree .nsmap ?->?
        # not using lxml anyway so not really an issue as
        # this condition shouldn't be met
        elem.nsmap[prefix] = uri
        return

    etree_module: Any
    if isinstance(elem, ep_etree.PyElementTree.Element):
        etree_module = ep_etree.PyElementTree
    else:
        etree_module = ep_etree.ElementTree
    etree_module.register_namespace(prefix, uri)


# patching
# A plain function is stored: module attributes are never bound, and a
# staticmethod object fetched from a module is not callable before Python 3.10.
setattr(ep_etree, "etree_register_namespace", etree_register_namespace)
# importing in order necessary for intended monkey patch
import xmlschema
# Monkey patching additional instance functions to the import of xmlschema,
# specifically xmlschema.dataobjects.DataElement.
# Instance functions so a DataElement object can use the above
# elementpath.etree namespace functions.
def register_namespace(self,
                       prefix: Optional[str] = None,
                       uri: Optional[str] = None) -> None:
    """Register ``prefix`` for ``uri`` via ``ep_etree.etree_register_namespace``.

    The data element is encoded with ``validation='lax'`` and the resulting
    root element is handed to the elementpath helper so it can choose the
    ElementTree module to register with. Nothing is registered unless both
    ``prefix`` and ``uri`` are given.
    """
    # root = self.encode(validation='strict')
    root, _errors = self.encode(validation='lax')  # lax: errors are ignored here
    if prefix is not None and uri is not None:
        ep_etree.etree_register_namespace(root, prefix, uri)


# patching
setattr(xmlschema.dataobjects.DataElement, "register_namespace",
        register_namespace)
def remove_registered_namespace(self, uri: str = '') -> None:
    """Drop the registration of ``uri`` via ``ep_etree.etree_remove_registered_namespace``.

    The data element is encoded with ``validation='lax'``; the encoded root
    is passed to the elementpath helper together with ``uri``.
    """
    # root = self.encode(validation='strict')
    encoded_root, _lax_errors = self.encode(validation='lax')
    ep_etree.etree_remove_registered_namespace(encoded_root, uri)


# patching
xmlschema.dataobjects.DataElement.remove_registered_namespace = (
    remove_registered_namespace
)
def get_registered_namespaces(self) -> dict:
    """Return what ``ep_etree.etree_get_registered_namespaces`` reports.

    The data element is encoded with ``validation='lax'`` and the encoded
    root is the element handed to the elementpath helper.
    """
    # root = self.encode(validation='strict')
    encoded_root, _lax_errors = self.encode(validation='lax')
    registry = ep_etree.etree_get_registered_namespaces(encoded_root)
    return registry


# patching
xmlschema.dataobjects.DataElement.get_registered_namespaces = (
    get_registered_namespaces
)
# replacing .validate() & .is_valid() on DataElement so that namespaces from
# the DataElement get set to the xml.etree.ElementTree
# register_namespace._namespace_map global when used
def validate(self, use_defaults: bool = True,
             namespaces: Optional[xmlschema.aliases.NamespacesType] = None,
             max_depth: Optional[int] = None) -> None:
    """
    Validates the XML data object.

    When ``namespaces`` is not given and ``self.nsmap`` is non-empty, the
    element's own ``nsmap`` is passed to ``iter_errors`` instead.

    :raises: :exc:`XMLSchemaValidationError` if XML data object is not valid.
    :raises: :exc:`XMLSchemaValueError` if the instance has no schema bindings.
    """
    if self.nsmap and namespaces is None:  # added code
        namespaces = self.nsmap  # added code
    # Raise the first reported error, if any.
    for error in self.iter_errors(use_defaults, namespaces, max_depth):
        raise error


# patching
setattr(xmlschema.dataobjects.DataElement, "validate", validate)
def is_valid(self, use_defaults: bool = True,
             namespaces: Optional[xmlschema.aliases.NamespacesType] = None,
             max_depth: Optional[int] = None) -> bool:
    """
    Like :meth:`validate` except it does not raise an exception on validation
    error but returns ``True`` if the XML data object is valid, ``False`` if
    it's invalid.

    When ``namespaces`` is not given and ``self.nsmap`` is non-empty, the
    element's own ``nsmap`` is passed to ``iter_errors`` instead.

    :raises: :exc:`XMLSchemaValueError` if the instance has no schema bindings.
    """
    if self.nsmap and namespaces is None:  # added code
        namespaces = self.nsmap  # added code
    # Only the first error matters: its presence decides the result.
    error = next(self.iter_errors(use_defaults, namespaces, max_depth), None)
    return error is None


# patching
setattr(xmlschema.dataobjects.DataElement, "is_valid", is_valid)
# replace .tostring() on DataElement to allow for xml_declaration/encoding
# support
# TODO research more, will likely customize a bit further
def tostring(self,
             namespaces: Optional[xmlschema.aliases.NamespacesType] = None,
             indent: str = '',
             max_lines: Optional[int] = None,
             spaces_for_tab: Optional[int] = None,
             xml_declaration: Optional[bool] = None,
             encoding: str = 'unicode',
             method: str = 'xml') -> Any:
    """Serialize the data element tree through ``ep_etree.etree_tostring``.

    The element is encoded with ``validation="strict"`` first. When
    ``namespaces`` is not given and ``self.nsmap`` is non-empty, the
    element's own ``nsmap`` is used. All remaining arguments are forwarded
    unchanged to ``etree_tostring``.

    :return: whatever ``ep_etree.etree_tostring`` returns for the arguments.
    """
    if self.nsmap and namespaces is None:
        namespaces = self.nsmap
    # Serializes the data element tree to an XML source string.
    # root, errors = self.encode(validation='lax')
    root = self.encode(validation="strict")  # prefer strict on my output just in case..
    return ep_etree.etree_tostring(
        root, namespaces, indent, max_lines, spaces_for_tab,
        xml_declaration, encoding, method)


# patching
setattr(xmlschema.dataobjects.DataElement, "tostring", tostring)
# add get_value function - paired with set_value
def get_value(self) -> Any:
    """Return ``self.value`` unchanged (read counterpart of ``set_value``)."""
    # The former debug ``print(type(self))`` was dropped: an accessor should
    # not write to stdout.
    return self.value
# patching: expose get_value as a method of DataElement
xmlschema.dataobjects.DataElement.get_value = get_value
# add set_value function
# assures change meets XMLSchema
# reverts back on error
# assumes data meets Schema to begin with, will remain unchanged in the end
# if it is not
def set_value(self,
              value: Any,
              use_defaults: bool = True,
              namespaces: Optional[xmlschema.aliases.NamespacesType] = None,
              max_depth: Optional[int] = None) -> None:
    """Assign ``value`` to ``self.value`` and keep it only if validation passes.

    The new value is assigned, then ``iter_errors`` is consulted; on the first
    reported error the previous value is put back and the error is raised.
    The rollback and the unlock also run if anything else raises in between,
    so the element is never left locked or half-changed.

    When ``namespaces`` is not given and ``self.nsmap`` is non-empty, the
    element's own ``nsmap`` is used for validation.

    :raises: :exc:`XMLSchemaValidationError` if XML data object is not valid
        after attempted change
    :raises: :exc:`XMLSchemaValueError` if the instance has no schema bindings.
    :raises: :exc:`ValueLockedError` if using ._locked and set to True
    """
    uses_lock = hasattr(self, "_locked")
    if uses_lock and self._locked:
        raise ValueLockedError(self, variable_name='value')
    if uses_lock:
        self._locked = True

    previous = self.value
    committed = False
    try:
        self.value = value
        if self.nsmap and namespaces is None:
            namespaces = self.nsmap
        for error in self.iter_errors(use_defaults, namespaces, max_depth):
            raise error
        committed = True
    finally:
        if not committed:
            self.value = previous  # revert value back to original
        if uses_lock:
            self._locked = False  # unlock before leaving if using/exists


# patching
setattr(xmlschema.dataobjects.DataElement, "set_value", set_value)
# add get_attrib function - paired with set_attrib
# remove added logic from .get(), requiring explicit matches only
def get_attrib(self, key: str) -> Any:
    """Return ``self.attrib[key]``.

    The lookup is a plain subscript, so only an exact key matches.

    :raises KeyError: if ``key`` is not present in ``self.attrib``.
    """
    return self.attrib[key]
# patching: expose get_attrib as a method of DataElement
xmlschema.dataobjects.DataElement.get_attrib = get_attrib
# add set_attrib function
# assures change meets XMLSchema
# reverts back on error
# assumes data meets Schema to begin with, will remain unchanged in the end
# if it is not
def set_attrib(self,
               key: str,
               value: Any,
               use_defaults: bool = True,
               namespaces: Optional[xmlschema.aliases.NamespacesType] = None,
               max_depth: Optional[int] = None) -> None:
    """Set ``self.attrib[key]`` to ``value`` and keep it only if validation passes.

    The attribute is written, then ``iter_errors`` is consulted; on the first
    reported error the attribute is restored (or removed again if it did not
    exist before) and the error is raised. The rollback and the unlock also
    run if anything else raises in between.

    If ``self._expand_xDE_attrib_prefix`` exists, the mirrored instance
    attribute ``<prefix><key>`` is updated after successful validation.

    When ``namespaces`` is not given and ``self.nsmap`` is non-empty, the
    element's own ``nsmap`` is used for validation.

    :raises: :exc:`XMLSchemaValidationError` if XML data object is not valid
        after attempted change
    :raises: :exc:`XMLSchemaValueError` if the instance has no schema bindings.
    :raises: :exc:`ValueLockedError` if using ._locked and set to True
    """
    uses_lock = hasattr(self, "_locked")
    if uses_lock and self._locked:
        raise ValueLockedError(self, variable_name='attrib[' + key + ']')
    if uses_lock:
        self._locked = True

    # Sentinel distinguishes "attribute was absent" from any stored value.
    absent = object()
    previous = self.attrib[key] if key in self.attrib else absent
    committed = False
    try:
        self.attrib[key] = value
        if self.nsmap and namespaces is None:
            namespaces = self.nsmap
        for error in self.iter_errors(use_defaults, namespaces, max_depth):
            raise error
        # TODO research @property / or some type of better variable binding?
        # self._expand_xDE_attrib_prefix exists if
        # expand_xmlschema_DataElement is run
        if hasattr(self, '_expand_xDE_attrib_prefix'):
            setattr(self, self._expand_xDE_attrib_prefix + key, value)
        committed = True
    finally:
        if not committed:
            if previous is absent:
                self.attrib.pop(key, None)  # it didn't exist, just delete
            else:
                self.attrib[key] = previous  # revert back to original
        if uses_lock:
            self._locked = False  # unlock before leaving if using/exists


# patching
setattr(xmlschema.dataobjects.DataElement, "set_attrib", set_attrib)
# add del_attrib function
# assures change meets XMLSchema
# reverts back on error
# assumes data meets Schema to begin with, will remain unchanged in the end
# if it is not
def del_attrib(self,
               key: str,
               use_defaults: bool = True,
               namespaces: Optional[xmlschema.aliases.NamespacesType] = None,
               max_depth: Optional[int] = None) -> None:
    """Delete ``self.attrib[key]`` and keep it deleted only if validation passes.

    The attribute is removed, then ``iter_errors`` is consulted; on the first
    reported error the attribute is recreated with its previous value, an
    explanatory note is appended to ``error.message`` (when the error has
    that attribute) and the error is raised. The rollback and the unlock also
    run if anything else raises in between.

    If ``self._expand_xDE_attrib_prefix`` exists, the mirrored instance
    attribute ``<prefix><key>`` is removed after successful validation,
    provided it is present.

    When ``namespaces`` is not given and ``self.nsmap`` is non-empty, the
    element's own ``nsmap`` is used for validation.

    :raises: :exc:`XMLSchemaValidationError` if XML data object is not valid
        after attempted change
    :raises: :exc:`XMLSchemaValueError` if the instance has no schema bindings.
    :raises: :exc:`ValueLockedError` if using ._locked and set to True
    :raises: :exc:`KeyError` if xml tag attribute (.attrib[key]) doesn't exist
    """
    uses_lock = hasattr(self, "_locked")
    if uses_lock and self._locked:
        raise ValueLockedError(self, variable_name='attrib[' + key + ']')
    if key not in self.attrib:
        raise KeyError("'" + key + "' Attribute does not exist, nothing to do")
    if uses_lock:
        self._locked = True

    previous = self.attrib[key]  # save original value
    committed = False
    try:
        del self.attrib[key]
        if self.nsmap and namespaces is None:
            namespaces = self.nsmap
        for error in self.iter_errors(use_defaults, namespaces, max_depth):
            # append informational message to error output
            if hasattr(error, "message"):
                error.message += (
                    ":\n\nThe attribute value was returned to original state due to error"
                    "\n\nThis error represents the state of this element IF the attribute were removed"
                )
            raise error
        # TODO research @property / or some type of better variable binding?
        # self._expand_xDE_attrib_prefix exists if
        # expand_xmlschema_DataElement is run
        if hasattr(self, '_expand_xDE_attrib_prefix'):
            mirror_name = self._expand_xDE_attrib_prefix + key
            # Guarded: the mirror may never have been created for this key.
            if hasattr(self, mirror_name):
                delattr(self, mirror_name)
        committed = True
    finally:
        if not committed:
            # attribute required, recreate value back to original
            self.attrib[key] = previous
        if uses_lock:
            self._locked = False  # unlock before leaving if using/exists


# patching
setattr(xmlschema.dataobjects.DataElement, "del_attrib", del_attrib)
# Monkey patching some class methods helpful for learning / troubleshooting
def _show_me_mro(cls):
    """Return ``cls.mro()``, the method resolution order of ``cls``."""
    return cls.mro()


# The function is wrapped exactly once per target class. Wrapping an object
# that is already a classmethod in classmethod() again relies on descriptor
# chaining, which only exists in Python 3.9-3.12.
setattr(xmlschema.validators.schemas.XsdValidator, "_show_me_mro",
        classmethod(_show_me_mro))
setattr(xmlschema.dataobjects.DataElement, "_show_me_mro",
        classmethod(_show_me_mro))
# Load the schema with the JsonML converter, then build the data-object tree
# for the XML document. Replace both paths with your own files.
schema = xmlschema.XMLSchema(
    "path/to/your.xsd",
    converter=xmlschema.JsonMLConverter,
)
xmlobj = schema.to_objects("path/to/your.xml")
# creates dot notation naming for all children recursively
# c_ default prefix for child, a_ default prefix for tag attribute
# _# numbered suffix for all children starting at 0
# increases from there if more than 1 child with same name
def expand_xmlschema_DataElement(xsobj: xmlschema.dataobjects.DataElement,
                                 child_prefix: str = 'c_',
                                 attrib_prefix: str = 'a_') -> None:
    """Attach dot-notation attributes for the attributes and children of ``xsobj``.

    On ``xsobj`` and, recursively, on every element yielded by
    ``iterchildren()``:

    * ``_expand_xDE_child_prefix`` / ``_expand_xDE_attrib_prefix`` record the
      prefixes in use and ``_locked`` is set to ``False``;
    * each ``attrib`` entry is copied to ``<attrib_prefix><key>``;
    * each child is bound to ``<child_prefix><local_name>_<n>``, where ``n``
      counts the children with the same ``local_name`` from 0 in iteration
      order.

    :param xsobj: the data element to expand in place.
    :param child_prefix: prefix for the per-child attributes.
    :param attrib_prefix: prefix for the per-XML-attribute attributes.
    """
    xsobj._expand_xDE_child_prefix = child_prefix
    xsobj._expand_xDE_attrib_prefix = attrib_prefix
    # _locked just an idea at the moment, may or may not use this in the end
    xsobj._locked = False

    # set a class attribute for each xml tag attribute
    # DO NOT change these directly, use set_attrib on the parent class which
    # changes .attrib first
    # These are currently just a copy of what is in the .attrib dict
    # Validation has no knowledge of their existence if they are changed
    # outside of design
    # TODO research @property / or some type of better variable binding?
    for key, value in xsobj.attrib.items():
        setattr(xsobj, attrib_prefix + key, value)

    # set a class attribute for each child
    # Per-name counters (rather than probing with hasattr) keep the numbering
    # tied to the child's position, so running the expansion again rebinds
    # the same names instead of creating shifted duplicates.
    next_index = {}
    for child in xsobj.iterchildren():
        # Propagate the caller's prefixes; otherwise descendants would
        # silently fall back to the defaults.
        expand_xmlschema_DataElement(child, child_prefix, attrib_prefix)
        index = next_index.get(child.local_name, 0)
        setattr(xsobj, child_prefix + child.local_name + "_" + str(index), child)
        next_index[child.local_name] = index + 1
# Expand the loaded document in place; only xmlobj is passed, so the default
# 'c_' / 'a_' prefixes apply.
expand_xmlschema_DataElement(xmlobj)


More information about the Python-list mailing list

AltStyle によって変換されたページ (->オリジナル) /