375 lines
14 KiB
Python
375 lines
14 KiB
Python
"""Functions for generating trees."""
|
|
from collections import defaultdict
|
|
|
|
import networkx as nx
|
|
from networkx.utils import py_random_state
|
|
|
|
__all__ = ["prefix_tree", "random_tree", "prefix_tree_recursive"]
|
|
|
|
|
|
def prefix_tree(paths):
|
|
"""Creates a directed prefix tree from a list of paths.
|
|
|
|
Usually the paths are described as strings or lists of integers.
|
|
|
|
A "prefix tree" represents the prefix structure of the strings.
|
|
Each node represents a prefix of some string. The root represents
|
|
the empty prefix with children for the single letter prefixes which
|
|
in turn have children for each double letter prefix starting with
|
|
the single letter corresponding to the parent node, and so on.
|
|
|
|
More generally the prefixes do not need to be strings. A prefix refers
|
|
to the start of a sequence. The root has children for each one element
|
|
prefix and they have children for each two element prefix that starts
|
|
with the one element sequence of the parent, and so on.
|
|
|
|
Note that this implementation uses integer nodes with an attribute.
|
|
Each node has an attribute "source" whose value is the original element
|
|
of the path to which this node corresponds. For example, suppose `paths`
|
|
consists of one path: "can". Then the nodes `[1, 2, 3]` which represent
|
|
this path have "source" values "c", "a" and "n".
|
|
|
|
All the descendants of a node have a common prefix in the sequence/path
|
|
associated with that node. From the returned tree, the prefix for each
|
|
node can be constructed by traversing the tree up to the root and
|
|
accumulating the "source" values along the way.
|
|
|
|
The root node is always `0` and has "source" attribute `None`.
|
|
The root is the only node with in-degree zero.
|
|
The nil node is always `-1` and has "source" attribute `"NIL"`.
|
|
The nil node is the only node with out-degree zero.
|
|
|
|
|
|
Parameters
|
|
----------
|
|
paths: iterable of paths
|
|
An iterable of paths which are themselves sequences.
|
|
Matching prefixes among these sequences are identified with
|
|
nodes of the prefix tree. One leaf of the tree is associated
|
|
with each path. (Identical paths are associated with the same
|
|
leaf of the tree.)
|
|
|
|
|
|
Returns
|
|
-------
|
|
tree: DiGraph
|
|
A directed graph representing an arborescence consisting of the
|
|
prefix tree generated by `paths`. Nodes are directed "downward",
|
|
from parent to child. A special "synthetic" root node is added
|
|
to be the parent of the first node in each path. A special
|
|
"synthetic" leaf node, the "nil" node `-1`, is added to be the child
|
|
of all nodes representing the last element in a path. (The
|
|
addition of this nil node technically makes this not an
|
|
arborescence but a directed acyclic graph; removing the nil node
|
|
makes it an arborescence.)
|
|
|
|
|
|
Notes
|
|
-----
|
|
The prefix tree is also known as a *trie*.
|
|
|
|
|
|
Examples
|
|
--------
|
|
Create a prefix tree from a list of strings with common prefixes::
|
|
|
|
>>> paths = ["ab", "abs", "ad"]
|
|
>>> T = nx.prefix_tree(paths)
|
|
>>> list(T.edges)
|
|
[(0, 1), (1, 2), (1, 4), (2, -1), (2, 3), (3, -1), (4, -1)]
|
|
|
|
The leaf nodes can be obtained as predecessors of the nil node::
|
|
|
|
>>> root, NIL = 0, -1
|
|
>>> list(T.predecessors(NIL))
|
|
[2, 3, 4]
|
|
|
|
To recover the original paths that generated the prefix tree,
|
|
traverse up the tree from the node `-1` to the node `0`::
|
|
|
|
>>> recovered = []
|
|
>>> for v in T.predecessors(NIL):
|
|
... prefix = ""
|
|
... while v != root:
|
|
... prefix = str(T.nodes[v]["source"]) + prefix
|
|
... v = next(T.predecessors(v)) # only one predecessor
|
|
... recovered.append(prefix)
|
|
>>> sorted(recovered)
|
|
['ab', 'abs', 'ad']
|
|
"""
|
|
|
|
def get_children(parent, paths):
|
|
children = defaultdict(list)
|
|
# Populate dictionary with key(s) as the child/children of the root and
|
|
# value(s) as the remaining paths of the corresponding child/children
|
|
for path in paths:
|
|
# If path is empty, we add an edge to the NIL node.
|
|
if not path:
|
|
tree.add_edge(parent, NIL)
|
|
continue
|
|
child, *rest = path
|
|
# `child` may exist as the head of more than one path in `paths`.
|
|
children[child].append(rest)
|
|
return children
|
|
|
|
# Initialize the prefix tree with a root node and a nil node.
|
|
tree = nx.DiGraph()
|
|
root = 0
|
|
tree.add_node(root, source=None)
|
|
NIL = -1
|
|
tree.add_node(NIL, source="NIL")
|
|
children = get_children(root, paths)
|
|
stack = [(root, iter(children.items()))]
|
|
while stack:
|
|
parent, remaining_children = stack[-1]
|
|
try:
|
|
child, remaining_paths = next(remaining_children)
|
|
# Pop item off stack if there are no remaining children
|
|
except StopIteration:
|
|
stack.pop()
|
|
continue
|
|
# We relabel each child with an unused name.
|
|
new_name = len(tree) - 1
|
|
# The "source" node attribute stores the original node name.
|
|
tree.add_node(new_name, source=child)
|
|
tree.add_edge(parent, new_name)
|
|
children = get_children(new_name, remaining_paths)
|
|
stack.append((new_name, iter(children.items())))
|
|
|
|
return tree
|
|
|
|
|
|
def prefix_tree_recursive(paths):
|
|
"""Recursively creates a directed prefix tree from a list of paths.
|
|
|
|
The original recursive version of prefix_tree for comparison. It is
|
|
the same algorithm but the recursion is unrolled onto a stack.
|
|
|
|
Usually the paths are described as strings or lists of integers.
|
|
|
|
A "prefix tree" represents the prefix structure of the strings.
|
|
Each node represents a prefix of some string. The root represents
|
|
the empty prefix with children for the single letter prefixes which
|
|
in turn have children for each double letter prefix starting with
|
|
the single letter corresponding to the parent node, and so on.
|
|
|
|
More generally the prefixes do not need to be strings. A prefix refers
|
|
to the start of a sequence. The root has children for each one element
|
|
prefix and they have children for each two element prefix that starts
|
|
with the one element sequence of the parent, and so on.
|
|
|
|
Note that this implementation uses integer nodes with an attribute.
|
|
Each node has an attribute "source" whose value is the original element
|
|
of the path to which this node corresponds. For example, suppose `paths`
|
|
consists of one path: "can". Then the nodes `[1, 2, 3]` which represent
|
|
this path have "source" values "c", "a" and "n".
|
|
|
|
All the descendants of a node have a common prefix in the sequence/path
|
|
associated with that node. From the returned tree, ehe prefix for each
|
|
node can be constructed by traversing the tree up to the root and
|
|
accumulating the "source" values along the way.
|
|
|
|
The root node is always `0` and has "source" attribute `None`.
|
|
The root is the only node with in-degree zero.
|
|
The nil node is always `-1` and has "source" attribute `"NIL"`.
|
|
The nil node is the only node with out-degree zero.
|
|
|
|
|
|
Parameters
|
|
----------
|
|
paths: iterable of paths
|
|
An iterable of paths which are themselves sequences.
|
|
Matching prefixes among these sequences are identified with
|
|
nodes of the prefix tree. One leaf of the tree is associated
|
|
with each path. (Identical paths are associated with the same
|
|
leaf of the tree.)
|
|
|
|
|
|
Returns
|
|
-------
|
|
tree: DiGraph
|
|
A directed graph representing an arborescence consisting of the
|
|
prefix tree generated by `paths`. Nodes are directed "downward",
|
|
from parent to child. A special "synthetic" root node is added
|
|
to be the parent of the first node in each path. A special
|
|
"synthetic" leaf node, the "nil" node `-1`, is added to be the child
|
|
of all nodes representing the last element in a path. (The
|
|
addition of this nil node technically makes this not an
|
|
arborescence but a directed acyclic graph; removing the nil node
|
|
makes it an arborescence.)
|
|
|
|
|
|
Notes
|
|
-----
|
|
The prefix tree is also known as a *trie*.
|
|
|
|
|
|
Examples
|
|
--------
|
|
Create a prefix tree from a list of strings with common prefixes::
|
|
|
|
>>> paths = ["ab", "abs", "ad"]
|
|
>>> T = nx.prefix_tree(paths)
|
|
>>> list(T.edges)
|
|
[(0, 1), (1, 2), (1, 4), (2, -1), (2, 3), (3, -1), (4, -1)]
|
|
|
|
The leaf nodes can be obtained as predecessors of the nil node.
|
|
|
|
>>> root, NIL = 0, -1
|
|
>>> list(T.predecessors(NIL))
|
|
[2, 3, 4]
|
|
|
|
To recover the original paths that generated the prefix tree,
|
|
traverse up the tree from the node `-1` to the node `0`::
|
|
|
|
>>> recovered = []
|
|
>>> for v in T.predecessors(NIL):
|
|
... prefix = ""
|
|
... while v != root:
|
|
... prefix = str(T.nodes[v]["source"]) + prefix
|
|
... v = next(T.predecessors(v)) # only one predecessor
|
|
... recovered.append(prefix)
|
|
>>> sorted(recovered)
|
|
['ab', 'abs', 'ad']
|
|
"""
|
|
|
|
def _helper(paths, root, tree):
|
|
"""Recursively create a trie from the given list of paths.
|
|
|
|
`paths` is a list of paths, each of which is itself a list of
|
|
nodes, relative to the given `root` (but not including it). This
|
|
list of paths will be interpreted as a tree-like structure, in
|
|
which two paths that share a prefix represent two branches of
|
|
the tree with the same initial segment.
|
|
|
|
`root` is the parent of the node at index 0 in each path.
|
|
|
|
`tree` is the "accumulator", the :class:`networkx.DiGraph`
|
|
representing the branching to which the new nodes and edges will
|
|
be added.
|
|
|
|
"""
|
|
# For each path, remove the first node and make it a child of root.
|
|
# Any remaining paths then get processed recursively.
|
|
children = defaultdict(list)
|
|
for path in paths:
|
|
# If path is empty, we add an edge to the NIL node.
|
|
if not path:
|
|
tree.add_edge(root, NIL)
|
|
continue
|
|
child, *rest = path
|
|
# `child` may exist as the head of more than one path in `paths`.
|
|
children[child].append(rest)
|
|
# Add a node for each child, connect root, recurse to remaining paths
|
|
for child, remaining_paths in children.items():
|
|
# We relabel each child with an unused name.
|
|
new_name = len(tree) - 1
|
|
# The "source" node attribute stores the original node name.
|
|
tree.add_node(new_name, source=child)
|
|
tree.add_edge(root, new_name)
|
|
_helper(remaining_paths, new_name, tree)
|
|
|
|
# Initialize the prefix tree with a root node and a nil node.
|
|
tree = nx.DiGraph()
|
|
root = 0
|
|
tree.add_node(root, source=None)
|
|
NIL = -1
|
|
tree.add_node(NIL, source="NIL")
|
|
# Populate the tree.
|
|
_helper(paths, root, tree)
|
|
return tree
|
|
|
|
|
|
# From the Wikipedia article on Prüfer sequences:
|
|
#
|
|
# > Generating uniformly distributed random Prüfer sequences and
|
|
# > converting them into the corresponding trees is a straightforward
|
|
# > method of generating uniformly distributed random labelled trees.
|
|
#
|
|
@py_random_state(1)
|
|
def random_tree(n, seed=None, create_using=None):
|
|
"""Returns a uniformly random tree on `n` nodes.
|
|
|
|
Parameters
|
|
----------
|
|
n : int
|
|
A positive integer representing the number of nodes in the tree.
|
|
seed : integer, random_state, or None (default)
|
|
Indicator of random number generation state.
|
|
See :ref:`Randomness<randomness>`.
|
|
create_using : NetworkX graph constructor, optional (default=nx.Graph)
|
|
Graph type to create. If graph instance, then cleared before populated.
|
|
|
|
Returns
|
|
-------
|
|
NetworkX graph
|
|
A tree, given as an undirected graph, whose nodes are numbers in
|
|
the set {0, …, *n* - 1}.
|
|
|
|
Raises
|
|
------
|
|
NetworkXPointlessConcept
|
|
If `n` is zero (because the null graph is not a tree).
|
|
|
|
Notes
|
|
-----
|
|
The current implementation of this function generates a uniformly
|
|
random Prüfer sequence then converts that to a tree via the
|
|
:func:`~networkx.from_prufer_sequence` function. Since there is a
|
|
bijection between Prüfer sequences of length *n* - 2 and trees on
|
|
*n* nodes, the tree is chosen uniformly at random from the set of
|
|
all trees on *n* nodes.
|
|
|
|
Examples
|
|
--------
|
|
>>> tree = nx.random_tree(n=10, seed=0)
|
|
>>> print(nx.forest_str(tree, sources=[0]))
|
|
╙── 0
|
|
├── 3
|
|
└── 4
|
|
├── 6
|
|
│ ├── 1
|
|
│ ├── 2
|
|
│ └── 7
|
|
│ └── 8
|
|
│ └── 5
|
|
└── 9
|
|
|
|
>>> tree = nx.random_tree(n=10, seed=0, create_using=nx.DiGraph)
|
|
>>> print(nx.forest_str(tree))
|
|
╙── 0
|
|
├─╼ 3
|
|
└─╼ 4
|
|
├─╼ 6
|
|
│ ├─╼ 1
|
|
│ ├─╼ 2
|
|
│ └─╼ 7
|
|
│ └─╼ 8
|
|
│ └─╼ 5
|
|
└─╼ 9
|
|
"""
|
|
if n == 0:
|
|
raise nx.NetworkXPointlessConcept("the null graph is not a tree")
|
|
# Cannot create a Prüfer sequence unless `n` is at least two.
|
|
if n == 1:
|
|
utree = nx.empty_graph(1, create_using)
|
|
else:
|
|
sequence = [seed.choice(range(n)) for i in range(n - 2)]
|
|
utree = nx.from_prufer_sequence(sequence)
|
|
|
|
if create_using is None:
|
|
tree = utree
|
|
else:
|
|
tree = nx.empty_graph(0, create_using)
|
|
if tree.is_directed():
|
|
# Use a arbitrary root node and dfs to define edge directions
|
|
edges = nx.dfs_edges(utree, source=0)
|
|
else:
|
|
edges = utree.edges
|
|
|
|
# Populate the specified graph type
|
|
tree.add_nodes_from(utree.nodes)
|
|
tree.add_edges_from(edges)
|
|
|
|
return tree
|