API reference¶
Module-level functions¶
- parse(xml: str) Document¶
Parse an XML string and return a Document.
- Parameters:
xml – A well-formed XML string.
- Raises:
XmlParseError – If the XML is syntactically malformed.
XmlWellFormednessError – If a well-formedness constraint is violated.
from pyuppsala import parse doc = parse("<root>hello</root>") print(doc.document_element.text_content) # "hello"
- parse_bytes(data: bytes) Document¶
Parse XML from bytes with automatic encoding detection (UTF-8, UTF-16 LE/BE).
- Parameters:
data – Raw bytes of an XML document.
- Raises:
XmlParseError – If the XML is malformed.
from pyuppsala import parse_bytes # UTF-8 doc = parse_bytes(b"<root>hello</root>") # UTF-16 LE with BOM data = b"\xff\xfe<\x00r\x00o\x00o\x00t\x00/\x00>\x00" doc = parse_bytes(data)
Document¶
- class Document(xml: str)¶
Parse an XML string into a DOM document.
- Parameters:
xml – A well-formed XML string.
from pyuppsala import Document doc = Document("<catalog><book>XML Guide</book></catalog>") print(doc.document_element.tag.local_name) # "catalog"
- static from_bytes(data: bytes) Document¶
Parse XML from bytes with automatic encoding detection.
doc = Document.from_bytes(b"<root/>")
- static empty() Document¶
Create a new empty document with no document element.
doc = Document.empty() root = doc.create_element("root") doc.append_child(doc.root, root) print(doc.to_xml()) # "<root/>"
- root: Node¶
The root node (the Document node itself). This is the parent of the document element, processing instructions, and comments at the top level.
- document_element: Node | None¶
The document element (top-level element), or None for empty documents.
- input_text: str¶
The original input text that was parsed to create this document. Returns an empty string for programmatically constructed documents.
xml = "<root>hello</root>" doc = Document(xml) assert doc.input_text == xml empty = Document.empty() assert empty.input_text == ""
- get_elements_by_tag_name(name: str) list[Node]¶
Find all elements in the document with the given local tag name.
doc = Document("<root><a/><b/><a/></root>") elements = doc.get_elements_by_tag_name("a") print(len(elements)) # 2
- get_elements_by_tag_name_ns(namespace_uri: str, name: str) list[Node]¶
Find all elements with the given namespace URI and local tag name.
doc = Document('<root xmlns:x="urn:ex"><x:item/><x:item/></root>') items = doc.get_elements_by_tag_name_ns("urn:ex", "item") print(len(items)) # 2
Tree mutation
- create_element(local_name, namespace_uri=None, prefix=None) Node¶
Create a new element node. The node is not yet attached to the tree; use
append_child(), insert_before(), or insert_after() to place it.
doc = Document.empty()
root = doc.create_element("root")
doc.append_child(doc.root, root)
# With namespace prefix
child = doc.create_element("item", namespace_uri="urn:ex", prefix="x")
doc.append_child(root, child)
print(doc.to_xml()) # '<root><x:item/></root>'
- create_text(text: str) Node¶
Create a new text node.
doc = Document("<root/>") root = doc.document_element text = doc.create_text("hello world") doc.append_child(root, text) print(doc.to_xml()) # "<root>hello world</root>"
- create_comment(text: str) Node¶
Create a new comment node.
doc = Document("<root/>") comment = doc.create_comment(" generated by pyuppsala ") doc.append_child(doc.document_element, comment) print(doc.to_xml()) # "<root><!-- generated by pyuppsala --></root>"
- create_cdata(text: str) Node¶
Create a new CDATA section node.
doc = Document("<root/>") cdata = doc.create_cdata("function() { return 1 < 2; }") doc.append_child(doc.document_element, cdata) print(doc.to_xml()) # <root><![CDATA[function() { return 1 < 2; }]]></root>
- create_processing_instruction(target: str, data: str | None = None) Node¶
Create a new processing instruction node.
doc = Document("<root/>") pi = doc.create_processing_instruction("xml-stylesheet", 'type="text/xsl" href="style.xsl"') doc.insert_before(doc.root, pi, doc.document_element) print(doc.to_xml()) # <?xml-stylesheet type="text/xsl" href="style.xsl"?><root/>
- append_child(parent: Node, child: Node) None¶
Append child as the last child of parent.
doc = Document("<root><a/></root>") root = doc.document_element b = doc.create_element("b") doc.append_child(root, b) print(doc.to_xml()) # "<root><a/><b/></root>"
- insert_before(parent: Node, new_child: Node, reference: Node) None¶
Insert new_child before reference under parent.
doc = Document("<root><b/></root>") root = doc.document_element a = doc.create_element("a") doc.insert_before(root, a, root.children[0]) print(doc.to_xml()) # "<root><a/><b/></root>"
- insert_after(parent: Node, new_child: Node, reference: Node) None¶
Insert new_child after reference under parent.
doc = Document("<root><a/></root>") root = doc.document_element b = doc.create_element("b") doc.insert_after(root, b, root.children[0]) print(doc.to_xml()) # "<root><a/><b/></root>"
- remove_child(parent: Node, child: Node) None¶
Remove child from parent.
doc = Document("<root><a/><b/></root>") root = doc.document_element a = root.children[0] doc.remove_child(root, a) print(doc.to_xml()) # "<root><b/></root>"
- replace_child(parent: Node, new_child: Node, old_child: Node) None¶
Replace old_child with new_child under parent.
doc = Document("<root><old/></root>") root = doc.document_element new = doc.create_element("new") doc.replace_child(root, new, root.children[0]) print(doc.to_xml()) # "<root><new/></root>"
- detach(node: Node) None¶
Detach node from its parent. The node remains valid and can be re-attached elsewhere.
doc = Document("<root><a/><b/><c/></root>") root = doc.document_element b = root.children[1] doc.detach(b) print(len(root.children)) # 2 # Re-attach at the end doc.append_child(root, b) print(doc.to_xml()) # "<root><a/><c/><b/></root>"
Serialization
- to_xml() str¶
Serialize the document to a compact XML string.
doc = Document("<root> <child/> </root>") print(doc.to_xml()) # "<root> <child/> </root>"
- to_xml_with_options(indent=None, expand_empty_elements=False) str¶
Serialize with formatting options.
- Parameters:
indent – Indentation string (e.g. "  "), or None for compact output.
expand_empty_elements – If True, write <foo></foo> instead of <foo/>.
doc = Document("<root><a/><b>text</b></root>")
# Pretty-print with 2-space indent
print(doc.to_xml_with_options(indent="  "))
# <root>
#   <a/>
#   <b>text</b>
# </root>
# Expand empty elements (useful for HTML compatibility)
print(doc.to_xml_with_options(expand_empty_elements=True))
# "<root><a></a><b>text</b></root>"
- write_to_file(path: str) None¶
Write the serialized document to a file.
doc = Document("<config><setting>value</setting></config>") doc.write_to_file("/tmp/config.xml")
XPath
- prepare_xpath() None¶
Build internal indices required for XPath evaluation. Call this once before using
XPathEvaluator on this document. If you modify the DOM after calling this, call it again.
doc = Document('<root attr="val"><child/></root>')
doc.prepare_xpath()
# Now XPath queries can access attribute nodes
Node¶
- class Node¶
A lightweight handle to a node in a
Document. Nodes do not own their data – the Document does. Do not use a Node after its parent Document has been garbage-collected.
- kind: str¶
The kind of this node. One of:
"document", "element", "text", "comment", "processing_instruction", "cdata", "attribute".
doc = Document("<root>text<!-- comment --></root>")
root = doc.document_element
print(root.kind) # "element"
print(root.children[0].kind) # "text"
print(root.children[1].kind) # "comment"
- tag: QName | None¶
The tag name for element nodes, or None for other kinds.
doc = Document('<ns:root xmlns:ns="urn:ex"/>')
root = doc.document_element
print(root.tag.local_name) # "root"
print(root.tag.namespace_uri) # "urn:ex"
print(root.tag.prefix) # "ns"
print(root.tag.prefixed_name) # "ns:root"
- text: str | None¶
The text content for text, comment, and CDATA nodes, or None.
doc = Document("<root>hello</root>")
text_node = doc.document_element.children[0]
print(text_node.text) # "hello"
print(doc.document_element.text) # None (element, not text)
- text_content: str¶
Recursively collected text content of this node and all descendants.
doc = Document("<p>Hello <b>world</b>!</p>") print(doc.document_element.text_content) # "Hello world!"
- element_text: str | None¶
The text of the first Text or CDATA child, or None. This is a fast way to get the text content of simple elements like <name>value</name> without recursing into descendants.
doc = Document("<root><name>Alice</name><bio>A <b>bold</b> person</bio></root>")
root = doc.document_element
name = root.children[0]
bio = root.children[1]
print(name.element_text) # "Alice"
print(bio.element_text) # "A " (only first text child, not recursive)
print(bio.text_content) # "A bold person" (recursive)
- attributes: list[Attribute]¶
The list of attributes for element nodes (empty list for non-elements).
doc = Document('<item id="1" status="active"/>') for attr in doc.document_element.attributes: print(f"{attr.name}: {attr.value}") # id: 1 # status: active
- parent: Node | None¶
The parent node, or None for the document root.
doc = Document("<root><child/></root>")
child = doc.document_element.children[0]
print(child.parent.tag.local_name) # "root"
- children: list[Node]¶
The child nodes.
doc = Document("<root><a/><b/><c/></root>") for child in doc.document_element.children: print(child.tag.local_name) # a, b, c
- column: int¶
The column number of this node in the original source.
xml = "<root>\n <child/>\n</root>" doc = Document(xml) child = doc.document_element.children[0] print(f"line {child.line}, column {child.column}") # line 2, column 3
- source_range: tuple[int, int] | None¶
The byte range (start, end) of this node in the original source. Returns None for programmatically created nodes.
xml = "<root><child>text</child></root>"
doc = Document(xml)
child = doc.document_element.children[0]
start, end = child.source_range
print(xml[start:end]) # "<child>text</child>"
- source: str | None¶
The original source text of this node. Returns None for programmatically created nodes.
xml = '<root><item id="1">hello</item></root>'
doc = Document(xml)
item = doc.document_element.children[0]
print(item.source) # '<item id="1">hello</item>'
# Programmatically created nodes have no source
new_elem = doc.create_element("new")
print(new_elem.source) # None
- get_attribute(name: str, namespace_uri: str | None = None) str | None¶
Get an attribute value by local name, optionally filtered by namespace.
doc = Document('<item id="42" xml:lang="en"/>') item = doc.document_element print(item.get_attribute("id")) # "42" # With namespace print(item.get_attribute("lang", namespace_uri="http://www.w3.org/XML/1998/namespace")) # "en" # Missing attribute print(item.get_attribute("missing")) # None
- set_attribute(name, value, namespace_uri=None, prefix=None) str | None¶
Set an attribute. Returns the previous value, or None.
doc = Document('<item id="1"/>')
item = doc.document_element
# Set a new attribute
old = item.set_attribute("status", "active")
print(old) # None
# Update an existing attribute
old = item.set_attribute("id", "2")
print(old) # "1"
print(doc.to_xml()) # '<item id="2" status="active"/>'
- remove_attribute(name: str) str | None¶
Remove an attribute by local name. Returns the old value, or None.
doc = Document('<item id="1" status="draft"/>')
item = doc.document_element
old = item.remove_attribute("status")
print(old) # "draft"
print(doc.to_xml()) # '<item id="1"/>'
- to_xml() str¶
Serialize this node and its subtree to XML.
doc = Document("<root><child>hello</child></root>") child = doc.document_element.children[0] print(child.to_xml()) # "<child>hello</child>"
- to_xml_with_options(indent=None, expand_empty_elements=False) str¶
Serialize this subtree with formatting options.
doc = Document("<root><a><b/></a></root>") a = doc.document_element.children[0] print(a.to_xml_with_options(indent=" ")) # <a> # <b/> # </a>
- get_elements_by_tag_name(name: str) list[Node]¶
Find descendant elements by local tag name.
doc = Document("<root><a><b/></a><b/></root>") root = doc.document_element bs = root.get_elements_by_tag_name("b") print(len(bs)) # 2 (finds nested elements too)
- get_elements_by_tag_name_ns(namespace_uri: str, name: str) list[Node]¶
Find descendant elements by namespace URI and local tag name.
doc = Document('<root xmlns:x="urn:ex"><x:item/><item/><x:item/></root>') root = doc.document_element items = root.get_elements_by_tag_name_ns("urn:ex", "item") print(len(items)) # 2 (only the namespaced ones)
- first_child_element_by_name_ns(namespace_uri: str, local_name: str) Node | None¶
Find the first direct child element matching the given namespace URI and local name. Only searches direct children, not descendants.
doc = Document("""\ <root xmlns:a="urn:example"> <a:name>Alice</a:name> <a:age>30</a:age> <a:name>Bob</a:name> </root> """) root = doc.document_element first_name = root.first_child_element_by_name_ns("urn:example", "name") print(first_name.element_text) # "Alice" # Returns None if not found missing = root.first_child_element_by_name_ns("urn:example", "email") print(missing) # None
- child_elements_by_name_ns(namespace_uri: str, local_name: str) list[Node]¶
Find all direct child elements matching the given namespace URI and local name. Only searches direct children, not descendants.
doc = Document("""\ <root xmlns:a="urn:example"> <a:item>first</a:item> <a:other>skip</a:other> <a:item>second</a:item> </root> """) root = doc.document_element items = root.child_elements_by_name_ns("urn:example", "item") for item in items: print(item.element_text) # first # second
- matches_name_ns(namespace_uri: str, local_name: str) bool¶
Check whether this element matches the given namespace URI and local name. Returns False for non-element nodes.
doc = Document('<a:root xmlns:a="urn:example">text</a:root>')
root = doc.document_element
print(root.matches_name_ns("urn:example", "root")) # True
print(root.matches_name_ns("urn:other", "root")) # False
# Text nodes always return False
text_node = root.children[0]
print(text_node.matches_name_ns("urn:example", "root")) # False
Protocols
- len(node) returns the number of child nodes.
- for child in node iterates over children.
- node[i] returns the i-th child (supports negative indices).
- bool(node) is always True.
- str(node) returns to_xml().
- repr(node) returns a short description like Node(<root>).
doc = Document("<root><a/><b/><c/></root>") root = doc.document_element print(len(root)) # 3 print(root[0].tag.local_name) # "a" print(root[-1].tag.local_name) # "c" for child in root: print(child.tag.local_name, end=" ") # a b c
QName¶
- class QName(local_name, namespace_uri=None, prefix=None)¶
A qualified XML name.
from pyuppsala import QName # Local name only q = QName("root") print(q.local_name) # "root" # With namespace q = QName("item", namespace_uri="urn:example", prefix="ex") print(q.prefixed_name) # "ex:item" print(q.namespace_uri) # "urn:example"
- matches(local_name: str, namespace_uri: str | None = None) bool¶
Check whether this QName matches the given local name and optional namespace URI. Prefix is ignored.
from pyuppsala import QName q = QName("item", namespace_uri="urn:example", prefix="ex") print(q.matches("item", namespace_uri="urn:example")) # True print(q.matches("item", namespace_uri="urn:other")) # False print(q.matches("item")) # False (namespace mismatch) print(q.matches("other", namespace_uri="urn:example")) # False # QName without namespace q2 = QName("root") print(q2.matches("root")) # True print(q2.matches("root", namespace_uri="urn:example")) # False
Equality is by local_name and namespace_uri (prefix is ignored). QNames are hashable.
from pyuppsala import QName
# Same namespace, different prefix -- equal
a = QName("item", namespace_uri="urn:ex", prefix="a")
b = QName("item", namespace_uri="urn:ex", prefix="b")
print(a == b) # True
# Can be used in sets and as dict keys
names = {a, b}
print(len(names)) # 1
Attribute¶
XPathEvaluator¶
- class XPathEvaluator¶
XPath 1.0 expression evaluator.
from pyuppsala import Document, XPathEvaluator doc = Document("<root><a>1</a><b>2</b><a>3</a></root>") doc.prepare_xpath() xpath = XPathEvaluator()
- add_namespace(prefix: str, uri: str) None¶
Register a namespace prefix for use in XPath expressions.
xpath = XPathEvaluator() xpath.add_namespace("soap", "http://schemas.xmlsoap.org/soap/envelope/") xpath.add_namespace("m", "urn:example")
- evaluate(doc, expr, context=None) list[Node] | bool | float | str¶
Evaluate an XPath expression. The return type depends on the XPath result type:
- Node-set -> list[Node]
- Boolean -> bool
- Number -> float
- String -> str
- Parameters:
doc – The Document to query (must have prepare_xpath() called).
expr – An XPath 1.0 expression string.
context – Optional context node. Defaults to the document root.
- Raises:
XPathError – If the expression is invalid.
doc = Document("<root><item>A</item><item>B</item></root>") doc.prepare_xpath() xpath = XPathEvaluator() # Node-set nodes = xpath.evaluate(doc, "//item") print(len(nodes)) # 2 # String text = xpath.evaluate(doc, "string(//item[1])") print(text) # "A" # Number count = xpath.evaluate(doc, "count(//item)") print(count) # 2.0 # Boolean has_items = xpath.evaluate(doc, "boolean(//item)") print(has_items) # True # With context node root = doc.document_element first_child = xpath.evaluate(doc, "string(item[1])", context=root) print(first_child) # "A"
- select(doc, expr, context=None) list[Node]¶
Evaluate an XPath expression and return matching nodes. This is a convenience method equivalent to
evaluate() when the result is a node-set.
doc = Document('<root><a id="1"/><b/><a id="2"/></root>')
doc.prepare_xpath()
xpath = XPathEvaluator()
nodes = xpath.select(doc, "//a")
for node in nodes:
    print(node.get_attribute("id")) # "1", "2"
XsdValidator¶
- class XsdValidator(schema_xml: str)¶
XSD schema validator. Supports XSD structures and datatypes, 44+ built-in types, facets, complex types, extensions, restrictions, list types, wildcards, substitution groups, identity constraints, and fixed-value constraints.
- Parameters:
schema_xml – An XSD schema as an XML string.
from pyuppsala import XsdValidator schema = """\ <xs:schema xmlns:xs="http://www.w3.org/2001/XMLSchema"> <xs:element name="age" type="xs:positiveInteger"/> </xs:schema> """ validator = XsdValidator(schema)
- static from_file(schema_xml: str, base_path: str) XsdValidator¶
Create a validator that resolves
xs:include, xs:import, and xs:redefine relative to base_path.
import os
schema_dir = "/path/to/schemas"
with open(os.path.join(schema_dir, "main.xsd")) as f:
    schema_xml = f.read()
validator = XsdValidator.from_file(schema_xml, schema_dir)
- validate(doc: Document) list[ValidationError]¶
Validate a parsed document. Returns a list of errors (empty = valid).
from pyuppsala import Document, XsdValidator schema = """\ <xs:schema xmlns:xs="http://www.w3.org/2001/XMLSchema"> <xs:element name="name" type="xs:string"/> </xs:schema> """ validator = XsdValidator(schema) doc = Document("<name>Alice</name>") errors = validator.validate(doc) print(len(errors)) # 0
- validate_str(xml: str) list[ValidationError]¶
Parse and validate an XML string in one step.
errors = validator.validate_str("<name>Alice</name>") print(len(errors)) # 0
- is_valid(doc: Document) bool¶
Quick boolean check.
doc = Document("<name>Alice</name>") print(validator.is_valid(doc)) # True
- is_valid_str(xml: str) bool¶
Quick boolean check from a string. Returns False for malformed XML instead of raising.
print(validator.is_valid_str("<name>Alice</name>")) # True
print(validator.is_valid_str("<name><bad/></name>")) # False
print(validator.is_valid_str("<unclosed")) # False (malformed)
- set_enforce_qname_length_facets(enforce: bool) None¶
Configure whether length facets on QName/NOTATION types are enforced. Enabled by default. See W3C Bug #4009.
ValidationError¶
- class ValidationError¶
A single XSD validation error.
errors = validator.validate_str("<age>-5</age>") for err in errors: print(f"Line {err.line}, Col {err.column}: {err.message}") print(repr(err)) # ValidationError('...', line=1, column=1) print(str(err)) # "1:1: ..."
XmlWriter¶
- class XmlWriter¶
An imperative XML builder for constructing XML fragments without a DOM.
from pyuppsala import XmlWriter w = XmlWriter() w.write_declaration() w.start_element("root") w.text("hello") w.end_element("root") print(w.to_string()) # <?xml version="1.0" encoding="UTF-8"?><root>hello</root>
- write_declaration_full(version='1.0', encoding=None, standalone=None) None¶
Write an XML declaration with custom parameters.
w = XmlWriter() w.write_declaration_full("1.0", encoding="ISO-8859-1", standalone=True) # <?xml version="1.0" encoding="ISO-8859-1" standalone="yes"?>
- start_element(name, attrs=None) None¶
Start an element. attrs is an optional list of (name, value) tuples.
w = XmlWriter()
w.start_element("div", [("class", "container"), ("id", "main")])
w.text("content")
w.end_element("div")
print(w.to_string()) # '<div class="container" id="main">content</div>'
- empty_element(name, attrs=None) None¶
Write a self-closing element: <name/>.
w = XmlWriter()
w.empty_element("br")
w.empty_element("img", [("src", "photo.jpg"), ("alt", "Photo")])
print(w.to_string()) # '<br/><img src="photo.jpg" alt="Photo"/>'
- empty_element_expanded(name, attrs=None) None¶
Write an expanded empty element: <name></name>.
w = XmlWriter()
w.empty_element_expanded("script", [("src", "app.js")])
print(w.to_string()) # '<script src="app.js"></script>'
- text(content: str) None¶
Write text content (auto-escaped).
w = XmlWriter()
w.start_element("p")
w.text("Price: 5 < 10 & tax > 0")
w.end_element("p")
print(w.to_string()) # "<p>Price: 5 &lt; 10 &amp; tax &gt; 0</p>"
- cdata(content: str) None¶
Write a CDATA section.
w = XmlWriter() w.start_element("script") w.cdata("if (a < b && c > d) { }") w.end_element("script") print(w.to_string()) # "<script><![CDATA[if (a < b && c > d) { }]]></script>"
- comment(content: str) None¶
Write a comment.
w = XmlWriter() w.comment(" This is a comment ") print(w.to_string()) # "<!-- This is a comment -->"
- processing_instruction(target, data=None) None¶
Write a processing instruction.
w = XmlWriter() w.processing_instruction("xml-stylesheet", 'type="text/xsl" href="style.xsl"') print(w.to_string()) # '<?xml-stylesheet type="text/xsl" href="style.xsl"?>'
- raw(xml: str) None¶
Write raw XML content (not escaped).
w = XmlWriter()
w.start_element("root")
w.raw("<already>escaped &amp; ready</already>")
w.end_element("root")
- to_bytes() bytes¶
Return the accumulated XML as bytes.
w = XmlWriter() w.start_element("root") w.end_element("root") data = w.to_bytes() print(type(data)) # <class 'bytes'>
Protocols
- len(writer) returns the number of bytes written so far.
- bool(writer) is True if any content has been written.
- str(writer) returns to_string().
w = XmlWriter() print(bool(w)) # False print(len(w)) # 0 w.start_element("root") w.end_element("root") print(bool(w)) # True print(len(w)) # 13
XsdRegex¶
- class XsdRegex(pattern: str)¶
XSD regular expression pattern matcher. XSD regexes are implicitly anchored – they must match the entire input string.
Supported features: alternation (|), grouping, quantifiers (*, +, ?, {n}, {n,m}), character classes with subtraction ([a-z-[aeiou]]), Unicode category escapes (\p{Lu}), Unicode block escapes (\p{IsBasicLatin}), multi-char escapes (\d, \s, \w, \i, \c).
- Parameters:
pattern – An XSD regex pattern string.
- Raises:
ValueError – If the pattern is invalid.
from pyuppsala import XsdRegex # Email-like pattern email = XsdRegex(r"[a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,}") print(email.is_match("user@example.com")) # True print(email.is_match("not-an-email")) # False