Rewrite the INI InventoryParser

The new code parses INI-format inventory files in a single pass using a
well-documented state machine that reports precise errors and eliminates
the duplications and inconsistencies and outright errors in the earlier
three-phase parsing code (e.g. three ways to skip comments). It is also
much easier now to follow what decisions are being taken on the basis of
the parsed data. The comments point out various potential improvements,
particularly in the area of consistent IPv6 handling.

On the ornate marble tombstone of the old code, the following
inscription is one last baffling memento from a bygone age:

-    def _before_comment(self, msg):
-        ''' what's the part of a string before a comment? '''
-        msg = msg.replace("\#","**NOT_A_COMMENT**")
-        msg = msg.split("#")[0]
-        msg = msg.replace("**NOT_A_COMMENT**","#")
-        return msg
This commit is contained in:
Abhijit Menon-Sen 2015-08-14 16:20:27 +05:30
parent d0350b1905
commit 1284c49bd7

View file

@ -1,4 +1,4 @@
# (c) 2012-2014, Michael DeHaan <michael.dehaan@gmail.com>
# Copyright 2015 Abhijit Menon-Sen <ams@2ndQuadrant.com>
#
# This file is part of Ansible
#
@ -33,28 +33,288 @@ from ansible.utils.unicode import to_unicode
class InventoryParser(object):
"""
Host inventory for ansible.
Takes an INI-format inventory file and builds a list of groups and subgroups
with their associated hosts and variable settings.
"""
def __init__(self, filename=C.DEFAULT_HOST_LIST):
self.filename = filename
# Start with an empty host list and the default 'all' and
# 'ungrouped' groups.
self.hosts = {}
self.patterns = {}
self.groups = dict(
all = Group(name='all'),
ungrouped = Group(name='ungrouped')
)
# Read in the hosts, groups, and variables defined in the
# inventory file.
with open(filename) as fh:
self.lines = fh.readlines()
self.groups = {}
self.hosts = {}
self._parse()
def _parse(self):
# Finally, add all top-level groups (including 'ungrouped') as
# children of 'all'.
self._parse_base_groups()
self._parse_group_children()
self._add_allgroup_children()
self._parse_group_variables()
return self.groups
for group in self.groups.values():
if group.depth == 0 and group.name != 'all':
self.groups['all'].add_child_group(group)
# Note: we could discard self.hosts after this point.
def _parse(self):
'''
Populates self.groups from the contents of self.lines. Raises an error
on any parse failure.
'''
self._compile_patterns()
# We behave as though the first line of the inventory is '[ungrouped]',
# and begin to look for host definitions. We make a single pass through
# each line of the inventory, building up self.groups and adding hosts,
# subgroups, and setting variables as we go.
pending_declarations = {}
section = 'ungrouped'
state = 'hosts'
i = 0
for line in self.lines:
i += 1
# Is there a better way to get rid of the ending \n?
line = line.strip()
# Skip empty lines and comments
if line == '' or line.startswith(";") or line.startswith("#"):
continue
# Is this a [section] header? That tells us what group we're parsing
# definitions for, and what kind of definitions to expect.
m = self.patterns['section'].match(line)
if m:
(section, state) = m.groups()
state = state or 'hosts'
if state not in ['hosts', 'children', 'vars']:
title = ":".join(m.groups())
raise AnsibleError("%s:%d: Section [%s] has unknown type: %s" % (self.filename, i, title, state))
# If we haven't seen this section before, we add a new Group.
#
# Either [groupname] or [groupname:children] is sufficient to
# declare a group, but [groupname:vars] is allowed only if the
# group is declared elsewhere (not necessarily earlier). We add
# the group anyway, but make a note in pending_declarations to
# check at the end.
if section not in self.groups:
self.groups[section] = Group(name=section)
if state == 'vars':
pending_declarations[section] = dict(line=i, state=state, name=section)
# When we see a declaration that we've been waiting for, we can
# delete the note.
if section in pending_declarations and state != 'vars':
del pending_declarations[section]
continue
# It's not a section, so the current state tells us what kind of
# definition it must be. The individual parsers will raise an
# error if we feed them something they can't digest.
# [groupname] contains host definitions that must be added to
# the current group.
if state == 'hosts':
hosts = self._parse_host_definition(line, i)
for h in hosts:
self.groups[section].add_host(h)
# [groupname:vars] contains variable definitions that must be
# applied to the current group.
elif state == 'vars':
(k, v) = self._parse_variable_definition(line, i)
self.groups[section].set_variable(k, v)
# [groupname:children] contains subgroup names that must be
# added as children of the current group. The subgroup names
# must themselves be declared as groups, but as before, they
# may only be declared later.
elif state == 'children':
child = self._parse_group_name(line, i)
if child not in self.groups:
self.groups[child] = Group(name=child)
pending_declarations[child] = dict(line=i, state=state, name=child, parent=section)
self.groups[section].add_child_group(self.groups[child])
# Note: there's no reason why we couldn't accept variable
# definitions here, and set them on the named child group.
# This is a fencepost. It can happen only if the state checker
# accepts a state that isn't handled above.
else:
raise AnsibleError("%s:%d: Entered unhandled state: %s" % (self.filename, i, state))
# Any entries in pending_declarations not removed by a group declaration
# above mean that there was an unresolved forward reference. We report
# only the first such error here.
for g in pending_declarations:
decl = pending_declarations[g]
if decl['state'] == 'vars':
raise AnsibleError("%s:%d: Section [%s:vars] not valid for undefined group: %s" % (self.filename, decl['line'], decl['name'], decl['name']))
elif decl['state'] == 'children':
raise AnsibleError("%s:%d: Section [%s:children] includes undefined group: %s" % (self.filename, decl['line'], decl['parent'], decl['name']))
def _parse_group_name(self, line, i):
'''
Takes a single line and tries to parse it as a group name. Returns the
group name if successful, or raises an error.
'''
m = self.patterns['groupname'].match(line)
if m:
return m.group(1)
raise AnsibleError("%s:%d: Expected group name, got: %s" % (self.filename, i, line))
def _parse_variable_definition(self, line, i):
'''
Takes a string and tries to parse it as a variable definition. Returns
the key and value if successful, or raises an error.
'''
# TODO: We parse variable assignments as a key (anything to the left of
# an '='"), an '=', and a value (anything left) and leave the value to
# _parse_value to sort out. We should be more systematic here about
# defining what is acceptable, how quotes work, and so on.
if '=' in line:
(k, v) = [e.strip() for e in line.split("=", 1)]
return (k, self._parse_value(v))
raise AnsibleError("%s:%d: Expected key=value, got: %s" % (self.filename, i, line))
def _parse_host_definition(self, line, i):
'''
Takes a single line and tries to parse it as a host definition. Returns
a list of Hosts if successful, or raises an error.
'''
# A host definition comprises (1) a non-whitespace hostname or range,
# optionally followed by (2) a series of key="some value" assignments.
# We ignore any trailing whitespace and/or comments. For example, here
# are a series of host definitions in a group:
#
# [groupname]
# alpha
# beta:2345 user=admin # we'll tell shlex
# gamma sudo=True user=root # to ignore comments
try:
tokens = shlex.split(line, comments=True)
except ValueError as e:
raise AnsibleError("%s:%d: Error parsing host definition '%s': %s" % (self.filename, i, varstring, e))
(hostnames, port) = self._expand_hostpattern(tokens[0])
hosts = self._Hosts(hostnames, port)
# Try to process anything remaining as a series of key=value pairs.
variables = {}
for t in tokens[1:]:
if '=' not in t:
raise AnsibleError("%s:%d: Expected key=value host variable assignment, got: %s" % (self.filename, i, t))
(k, v) = t.split('=', 1)
variables[k] = self._parse_value(v)
# Apply any variable settings found to every host.
for h in hosts:
for k in variables:
h.set_variable(k, variables[k])
if k == 'ansible_ssh_host':
h.ipv4_address = variables[k]
return hosts
def _expand_hostpattern(self, pattern):
'''
Takes a single host pattern and returns a list of hostnames and an
optional port number that applies to all of them.
'''
# First, we extract the port number. This is usually ":NN" at the end of
# the expression, but for IPv6 addresses it's ".NN" instead. In either
# case, we remove it.
port = None
if ':' in pattern:
pos = pattern.rindex(':')
try:
port = int(pattern[pos+1:])
pattern = pattern[0:pos]
except ValueError:
pass
else:
m = self.patterns['ipv6_hostport'].match(pattern)
if m:
(pattern, port) = m.groups()
# We're done, because we know this is a single IPv6 address.
# But should we support ranges for IPv6 address generation?
# See the FIXME note below. We should probably just accept
# "[xxx]:nn" syntax instead, and then let xxx be expanded.
return ([pattern], int(port))
# Now we're left with just the pattern, which results in a list of one
# or more hostnames, depending on whether it contains any [x:y] ranges.
if detect_range(pattern):
hostnames = expand_hostname_range(pattern)
else:
hostnames = [pattern]
return (hostnames, port)
def _Hosts(self, hostnames, port):
'''
Takes a list of hostnames and a port (which may be None) and returns a
list of Hosts (without recreating anything in self.hosts).
'''
hosts = []
# Note that we decide whether or not to create a Host based solely on
# the (non-)existence of its hostname in self.hosts. This means that one
# cannot add both "foo:22" and "foo:23" to the inventory. This behaviour
# is preserved for now, but this may be an easy FIXME.
for hn in hostnames:
if hn not in self.hosts:
self.hosts[hn] = Host(name=hn, port=port)
hosts.append(self.hosts[hn])
return hosts
@staticmethod
def _parse_value(v):
'''
Does something with something and returns something. Not for mere
mortals such as myself to interpret.
'''
if "#" not in v:
try:
v = ast.literal_eval(v)
@ -68,158 +328,62 @@ class InventoryParser(object):
pass
return to_unicode(v, nonstring='passthru', errors='strict')
# [webservers]
# alpha
# beta:2345
# gamma sudo=True user=root
# delta asdf=jkl favcolor=red
def _add_allgroup_children(self):
for group in self.groups.values():
if group.depth == 0 and group.name != 'all':
self.groups['all'].add_child_group(group)
def _parse_base_groups(self):
# FIXME: refactor
ungrouped = Group(name='ungrouped')
all = Group(name='all')
all.add_child_group(ungrouped)
self.groups = dict(all=all, ungrouped=ungrouped)
active_group_name = 'ungrouped'
i = 0
for line in self.lines:
i += 1
line = self._before_comment(line).strip()
if line.startswith("[") and line.endswith("]"):
active_group_name = line.replace("[","").replace("]","")
if ":vars" in line or ":children" in line:
active_group_name = active_group_name.rsplit(":", 1)[0]
if active_group_name not in self.groups:
new_group = self.groups[active_group_name] = Group(name=active_group_name)
active_group_name = None
elif active_group_name not in self.groups:
new_group = self.groups[active_group_name] = Group(name=active_group_name)
elif line.startswith(";") or line == '':
pass
elif active_group_name:
try:
tokens = shlex.split(line)
except ValueError as e:
raise AnsibleError("Error in %s, unable to parse L#%d: %s\n\n\t%s\n" % (self.filename, i, str(e), line))
if len(tokens) == 0:
continue
hostname = tokens[0]
port = None
# Three cases to check:
# 0. A hostname that contains a range pesudo-code and a port
# 1. A hostname that contains just a port
if hostname.count(":") > 1:
# Possible an IPv6 address, or maybe a host line with multiple ranges
# IPv6 with Port XXX:XXX::XXX.port
# FQDN foo.example.com
if hostname.count(".") == 1:
(hostname, port) = hostname.rsplit(".", 1)
elif ("[" in hostname and
"]" in hostname and
":" in hostname and
(hostname.rindex("]") < hostname.rindex(":")) or
("]" not in hostname and ":" in hostname)):
(hostname, port) = hostname.rsplit(":", 1)
hostnames = []
if detect_range(hostname):
hostnames = expand_hostname_range(hostname)
else:
hostnames = [hostname]
for hn in hostnames:
host = None
if hn in self.hosts:
host = self.hosts[hn]
else:
host = Host(name=hn, port=port)
self.hosts[hn] = host
if len(tokens) > 1:
for t in tokens[1:]:
if t.startswith('#'):
break
try:
(k,v) = t.split("=", 1)
except ValueError, e:
raise AnsibleError("Invalid ini entry in %s: %s - %s" % (self.filename, t, str(e)))
v = self._parse_value(v)
if k == 'ansible_ssh_host':
host.ipv4_address = v
host.set_variable(k, v)
self.groups[active_group_name].add_host(host)
# [southeast:children]
# atlanta
# raleigh
def _parse_group_children(self):
group = None
for line in self.lines:
line = line.strip()
if line is None or line == '':
continue
if line.startswith("[") and ":children]" in line:
line = line.replace("[","").replace(":children]","")
group = self.groups.get(line, None)
if group is None:
group = self.groups[line] = Group(name=line)
elif line.startswith("#") or line.startswith(";"):
pass
elif line.startswith("["):
group = None
elif group:
kid_group = self.groups.get(line, None)
if kid_group is None:
raise AnsibleError("child group is not defined: (%s)" % line)
else:
group.add_child_group(kid_group)
# [webservers:vars]
# http_port=1234
# maxRequestsPerChild=200
def _parse_group_variables(self):
group = None
for line in self.lines:
line = line.strip()
if line.startswith("[") and ":vars]" in line:
line = line.replace("[","").replace(":vars]","")
group = self.groups.get(line, None)
if group is None:
raise AnsibleError("can't add vars to undefined group: %s" % line)
elif line.startswith("#") or line.startswith(";"):
pass
elif line.startswith("["):
group = None
elif line == '':
pass
elif group:
if "=" not in line:
raise AnsibleError("variables assigned to group must be in key=value form")
else:
(k, v) = [e.strip() for e in line.split("=", 1)]
group.set_variable(k, self._parse_value(v))
def get_host_variables(self, host):
return {}
def _before_comment(self, msg):
''' what's the part of a string before a comment? '''
msg = msg.replace("\#","**NOT_A_COMMENT**")
msg = msg.split("#")[0]
msg = msg.replace("**NOT_A_COMMENT**","#")
return msg
def _compile_patterns(self):
'''
Compiles the regular expressions required to parse the inventory and
stores them in self.patterns.
'''
# Section names are square-bracketed expressions at the beginning of a
# line, comprising (1) a group name optionally followed by (2) a tag
# that specifies the contents of the section. We ignore any trailing
# whitespace and/or comments. For example:
#
# [groupname]
# [somegroup:vars]
# [naughty:children] # only get coal in their stockings
self.patterns['section'] = re.compile(
r'''^\[
([^:\]\s]+) # group name (see groupname below)
(?::(\w+))? # optional : and tag name
\]
\s* # ignore trailing whitespace
(?:\#.*)? # and/or a comment till the
$ # end of the line
''', re.X
)
# FIXME: What are the real restrictions on group names, or rather, what
# should they be? At the moment, they must be non-empty sequences of non
# whitespace characters excluding ':' and ']', but we should define more
# precise rules in order to support better diagnostics. The same applies
# to hostnames.
self.patterns['groupname'] = re.compile(
r'''^
([^:\]\s]+)
\s* # ignore trailing whitespace
(?:\#.*)? # and/or a comment till the
$ # end of the line
''', re.X
)
# This matches an IPv6 address, a '.', and a port number. It's not yet
# very strict about matching the IPv6 address.
#
# FIXME: There are various shortcomings in the IPv6 handling in the
# old code, which aren't fixed here yet. For example, Inventory's
# parse_inventory() method seems to accept "[ipv6]:nn" syntax. We
# should pick one and stick with it.
self.patterns['ipv6_hostport'] = re.compile(
r'''^
([a-fA-F0-9:]+)
\.([0-9]+)
$
''', re.X
)