Source code for google.appengine.ext.bulkload.simplexml_connector

#!/usr/bin/env python
#
# Copyright 2007 Google Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#





"""Bulkloader XML reading and writing.

Handle the XML format specified in a bulkloader.yaml file.
"""











import codecs
import logging
import re

from xml.etree import cElementTree as ElementTree
from xml.sax import saxutils

from google.appengine.ext.bulkload import bulkloader_errors
from google.appengine.ext.bulkload import connector_interface


# Pattern for an xpath made up solely of plain "/name" steps, where each name
# is ASCII alphanumeric and starts with a letter (e.g. "/root/to/node").
# SimpleXmlConnector only supports export for xpath_to_nodes values that
# match this pattern.
NODE_PATH_ONLY_RE = '(/[a-zA-Z][a-zA-Z0-9]*)+$'


class SimpleXmlConnector(connector_interface.ConnectorInterface):
    """Read/write a simply-structured XML file and convert dicts for each record.

    A simply-structured XML file is one where we can locate all interesting
    nodes with a simple (ElementTree supported) xpath, and each node contains
    either all the info we care about as child (and not grandchild) nodes with
    text or as attributes.

    We'll also pass the entire node in case the developer wants to do something
    more interesting with it (occasional grandchildren, parents, etc.).

    This is of course a fairly expensive way to read XML--we build a DOM, then
    copy parts of it into a dict. A pull model would work well with the
    interface too.
    """

    # Each record's fields are child elements of the record node.
    ELEMENT_CENTRIC = 1
    # Each record's fields are attributes of the record node.
    ATTRIBUTE_CENTRIC = 2

    @classmethod
    def create_from_options(cls, options, name):
        """Factory using an options dictionary.

        Args:
          options: Dictionary of options. Must contain:
            * xpath_to_nodes: The xpath to select a record.
            * style: 'element_centric' or 'attribute_centric'
          name: The name of this transformer, for use in error messages.

        Returns:
          XmlConnector connector object described by the specified options.

        Raises:
          InvalidConfiguration: If the config is invalid.
        """
        xpath_to_nodes = options.get('xpath_to_nodes')
        if not xpath_to_nodes:
            raise bulkloader_errors.InvalidConfiguration(
                'simplexml must specify xpath_to_nodes. (In transformer named %s)'
                % (name,))

        # A more complex xpath is still usable for import, so only warn here;
        # initialize_export() is what actually rejects it.
        if not re.match(NODE_PATH_ONLY_RE, xpath_to_nodes):
            logging.warning('simplexml export only supports very simple '
                            '/root/to/node xpath_to_nodes for now.')

        xml_style = options.get('style')
        xml_style_mapping = {
            'element_centric': cls.ELEMENT_CENTRIC,
            'attribute_centric': cls.ATTRIBUTE_CENTRIC,
        }
        if xml_style not in xml_style_mapping:
            # Sorted so the error message is deterministic.
            raise bulkloader_errors.InvalidConfiguration(
                'simplexml must specify one of these valid xml_style options: "%s". '
                'You specified %s in transformer named %s.' %
                ('", "'.join(sorted(xml_style_mapping)), xml_style, name))
        return cls(xpath_to_nodes, xml_style_mapping[xml_style])

    def __init__(self, xpath_to_nodes, xml_style):
        """Constructor.

        Args:
          xpath_to_nodes: xpath to the nodes to run over.
          xml_style: ELEMENT_CENTRIC or ATTRIBUTE_CENTRIC--on import we'll
            either convert the list of child elements to a dict (the first
            element of a given name is used) or the list of attributes.

        Raises:
          InvalidConfiguration: If xml_style is not one of the two constants.
        """
        # Raise rather than assert: asserts vanish under "python -O" and the
        # documented contract is InvalidConfiguration.
        if xml_style not in (self.ELEMENT_CENTRIC, self.ATTRIBUTE_CENTRIC):
            raise bulkloader_errors.InvalidConfiguration(
                'simplexml xml_style must be ELEMENT_CENTRIC or '
                'ATTRIBUTE_CENTRIC; got %r.' % (xml_style,))

        self.xpath_to_nodes = xpath_to_nodes
        self.xml_style = xml_style
        self.output_stream = None
        self.bulkload_state = None
        self.depth = 0
        # Current indentation prefix for export; recomputed whenever depth
        # changes. Initialised here so the attribute always exists.
        self.indent = ''

        if re.match(NODE_PATH_ONLY_RE, xpath_to_nodes):
            # "/a/b/c" -> wrapper nodes ['a', 'b'] and record node 'c'.
            path_steps = xpath_to_nodes.split('/')[1:]
            self.entity_node = path_steps[-1]
            self.node_list = path_steps[:-1]
        else:
            # Export is unsupported for this xpath; see initialize_export().
            self.entity_node = None
            self.node_list = None

    def generate_import_record(self, filename, bulkload_state):
        """Generator, yields dicts for nodes found as described in the options.

        Args:
          filename: File to parse with ElementTree.
          bulkload_state: Stored on self.bulkload_state; not otherwise used
            here.

        Yields:
          One dict per matching node. For ELEMENT_CENTRIC the dict maps each
          child tag to that child's text (first child of a given tag wins);
          otherwise it holds the node's attributes. In both cases the key
          '__node__' holds the node itself.
        """
        self.bulkload_state = bulkload_state
        tree = ElementTree.parse(filename)
        xpath_to_nodes = self.xpath_to_nodes

        nodes = None
        # An absolute "/root/..." path (but not "//...") names the document
        # root as its first step. Check the root tag ourselves and search with
        # the remainder of the path.
        if (len(xpath_to_nodes) > 1 and xpath_to_nodes[0] == '/' and
                xpath_to_nodes[1] != '/'):
            path_parts = xpath_to_nodes.split('/', 2)
            root = tree.getroot()
            if root.tag != path_parts[1]:
                return
            if len(path_parts) < 3:
                # The path is just "/root": the root element is the only
                # record. (Indexing the missing remainder used to raise
                # IndexError.)
                nodes = [root]
            else:
                xpath_to_nodes = '/' + path_parts[2]

        if nodes is None:
            nodes = tree.findall(xpath_to_nodes)

        for node in nodes:
            if self.xml_style == self.ELEMENT_CENTRIC:
                input_dict = {}
                # Iterate the element directly; getchildren() is deprecated.
                for child in node:
                    if child.tag not in input_dict:
                        input_dict[child.tag] = child.text
            else:
                input_dict = dict(node.items())
            input_dict['__node__'] = node
            yield input_dict

    def initialize_export(self, filename, bulkload_state):
        """Initialize the output file.

        Opens filename as UTF-8, writes the XML declaration and then the
        opening tag of every wrapper node in self.node_list.

        Args:
          filename: Path of the file to write.
          bulkload_state: Stored on self.bulkload_state; not otherwise used
            here.

        Raises:
          InvalidConfiguration: If self.node_list is empty or None, i.e. the
            xpath has no wrapper nodes or is not a simple /root/to/node path.
        """
        self.bulkload_state = bulkload_state
        if not self.node_list:
            raise bulkloader_errors.InvalidConfiguration(
                'simplexml export only supports simple /root/to/node xpath_to_nodes '
                'for now.')
        self.output_stream = codecs.open(filename, 'wb', 'utf-8')
        self.output_stream.write('<?xml version="1.0"?>\n')
        self.depth = 0
        for node in self.node_list:
            self.output_stream.write('%s<%s>\n' % (' ' * self.depth, node))
            self.depth += 1
        self.indent = ' ' * self.depth

    def write_iterable_as_elements(self, values):
        """Write a dict as elements, possibly recursively.

        Args:
          values: A dict, or an iterable of (name, value) pairs. String values
            are written as escaped element text; any other value is treated
            as a nested dict/iterable of pairs and written recursively.
        """
        if isinstance(values, dict):
            values = values.iteritems()
        for (name, value) in values:
            if isinstance(value, basestring):
                self.output_stream.write(
                    '%s <%s>%s</%s>\n' %
                    (self.indent, name, saxutils.escape(value), name))
            else:
                self.output_stream.write('%s <%s>\n' % (self.indent, name))
                # Indent one level deeper for the nested values, then restore.
                self.depth += 1
                self.indent = ' ' * self.depth
                self.write_iterable_as_elements(value)
                self.depth -= 1
                self.indent = ' ' * self.depth
                self.output_stream.write('%s </%s>\n' % (self.indent, name))

    def write_dict(self, dictionary):
        """Write one record for the specified entity.

        Args:
          dictionary: The record to write. For ELEMENT_CENTRIC it is written
            as child elements of an entity_node element; otherwise every item
            becomes a quoted attribute of a self-closing entity_node element.
        """
        if self.xml_style == self.ELEMENT_CENTRIC:
            self.output_stream.write(
                '%s<%s>\n' % (self.indent, self.entity_node))
            self.write_iterable_as_elements(dictionary)
            self.output_stream.write(
                '%s</%s>\n' % (self.indent, self.entity_node))
        else:
            self.output_stream.write(
                '%s<%s ' % (self.indent, self.entity_node))
            for (name, value) in dictionary.iteritems():
                self.output_stream.write(
                    '%s=%s ' % (name, saxutils.quoteattr(value)))
            self.output_stream.write('/>\n')

    def finalize_export(self):
        """Write the closing wrapper tags and close the output file.

        Does nothing if no output stream is open.
        """
        if not self.output_stream:
            return
        try:
            for node in reversed(self.node_list):
                self.depth -= 1
                self.output_stream.write(
                    '%s</%s>\n' % (' ' * self.depth, node))
        finally:
            # Release the file handle even if writing a closing tag fails.
            self.output_stream.close()
            self.output_stream = None

Send feedback about...

App Engine standard environment for Python