dbt-selly/dbt-env/lib/python3.8/site-packages/agate/table/join.py

#!/usr/bin/env python
# pylint: disable=W0212

from agate.rows import Row
from agate import utils


def join(self, right_table, left_key=None, right_key=None, inner=False, full_outer=False, require_match=False, columns=None):
    """
    Create a new table by joining two tables on common values. This method
    implements most varieties of SQL join, in addition to some unique features.

    If :code:`left_key` and :code:`right_key` are both :code:`None` then this
    method will perform a "sequential join", which is to say it will join on
    row number. The :code:`inner` and :code:`full_outer` arguments will
    determine whether dangling left-hand and right-hand rows are included,
    respectively.

    If :code:`left_key` is specified, then a "left outer join" will be
    performed. This will combine columns from the :code:`right_table` anywhere
    that :code:`left_key` and :code:`right_key` are equal. Unmatched rows from
    the left table will be included with the right-hand columns set to
    :code:`None`.

    If :code:`inner` is :code:`True` then an "inner join" will be performed.
    Unmatched rows from either table will be left out.

    If :code:`full_outer` is :code:`True` then a "full outer join" will be
    performed. Unmatched rows from both tables will be included, with the
    columns in the other table set to :code:`None`.

    In all cases, if :code:`right_key` is :code:`None` then :code:`left_key`
    will be used for both tables.

    If :code:`left_key` and :code:`right_key` are column names, the right-hand
    identifier column will not be included in the output table (unless
    :code:`columns` is given or :code:`full_outer` is :code:`True`).

    If :code:`require_match` is :code:`True` unmatched rows will raise an
    exception. This is like an "inner join" except any left-hand row that
    doesn't have a match will raise an exception instead of being dropped.
    This is useful for enforcing expectations about datasets that should
    match.

    Column names from the right table which also exist in this table will
    be suffixed "2" in the new table.

    A subset of columns from the right-hand table can be included in the joined
    table using the :code:`columns` argument.

    :param right_table:
        The "right" table to join to.
    :param left_key:
        Either the name of a column from this table to join on, the index
        of a column, a sequence of such column identifiers, a
        :class:`function` that takes a row and returns a value to join on, or
        :code:`None` in which case the tables will be joined on row number.
    :param right_key:
        Either the name of a column from :code:`right_table` to join on, the
        index of a column, a sequence of such column identifiers, or a
        :class:`function` that takes a row and returns a value to join on. If
        :code:`None` then :code:`left_key` will be used for both. If
        :code:`left_key` is :code:`None` then this value is ignored.
    :param inner:
        Perform a SQL-style "inner join" instead of a left outer join. Rows
        which have no match for :code:`left_key` will not be included in
        the output table.
    :param full_outer:
        Perform a SQL-style "full outer" join rather than a left or a right.
        May not be used in combination with :code:`inner`.
    :param require_match:
        If true, an exception will be raised if there is a left_key with no
        matching right_key.
    :param columns:
        A sequence of column names from :code:`right_table` to include in
        the final output table. Defaults to all columns not in
        :code:`right_key`. Ignored when :code:`full_outer` is :code:`True`.
    :returns:
        A new :class:`.Table`.
    :raises ValueError:
        If both :code:`inner` and :code:`full_outer` are true, or if
        :code:`require_match` is true and a left-hand key has no match in
        the right-hand table.
    """
    if inner and full_outer:
        raise ValueError('A join can not be both "inner" and "full_outer".')

    if right_key is None:
        right_key = left_key

    # Positions of the right-hand key columns. Stays empty for sequential and
    # function-keyed joins, where no right-hand column is identified as key.
    right_key_indices = []

    # NOTE: the join values of both sides are always materialized. Each side
    # is iterated more than once when ``full_outer`` is set, so a lazy ``zip``
    # iterator would already be exhausted by the time the full outer pass
    # runs and unmatched right-hand rows would be silently lost.

    # Left key is None: sequential join on row number
    if left_key is None:
        left_data = tuple(range(len(self._rows)))
    # Left key is a function
    elif callable(left_key):
        left_data = [left_key(row) for row in self._rows]
    # Left key is a sequence: join on tuples of the selected columns' values
    elif utils.issequence(left_key):
        left_columns = [self._columns[key] for key in left_key]
        left_data = list(zip(*[column.values() for column in left_columns]))
    # Left key is a column name/index
    else:
        left_data = self._columns[left_key].values()

    # Sequential join
    if left_key is None:
        right_data = tuple(range(len(right_table._rows)))
    # Right key is a function
    elif callable(right_key):
        right_data = [right_key(row) for row in right_table._rows]
    # Right key is a sequence
    elif utils.issequence(right_key):
        right_columns = [right_table._columns[key] for key in right_key]
        right_data = list(zip(*[column.values() for column in right_columns]))
        right_key_indices = [right_table._columns._keys.index(key) for key in right_key]
    # Right key is a column name/index
    else:
        right_column = right_table._columns[right_key]
        right_data = right_column.values()
        right_key_indices = [right_table._columns.index(right_column)]

    # Right-hand key columns are dropped only for a non-full-outer join with
    # no explicit column selection; ``columns`` is ignored for full outer.
    drop_key_columns = columns is None and not full_outer
    restrict_columns = columns is not None and not full_outer

    # Build names and type lists
    column_names = list(self._column_names)
    column_types = list(self._column_types)

    for i, column in enumerate(right_table._columns):
        name = column.name

        if drop_key_columns and i in right_key_indices:
            continue

        if restrict_columns and name not in columns:
            continue

        # Disambiguate right-hand names that clash with a left-hand name
        if name in self.column_names:
            column_names.append('%s2' % name)
        else:
            column_names.append(name)

        column_types.append(column.data_type)

    # The join values were captured above, so the right table can now be
    # narrowed to the requested columns without affecting the matching.
    if restrict_columns:
        right_table = right_table.select([n for n in right_table._column_names if n in columns])

    # Index right-hand rows by join value
    right_hash = {}

    for i, value in enumerate(right_data):
        right_hash.setdefault(value, []).append(right_table._rows[i])

    # Placeholder cells appended to a left-hand row that has no match
    unmatched_fill = [
        None
        for k, _ in enumerate(right_table._column_names)
        if not (drop_key_columns and k in right_key_indices)
    ]

    # Collect new rows
    rows = []

    keep_row_names = self._row_names is not None and not full_outer
    row_names = [] if keep_row_names else None

    # Iterate over left column
    for left_index, left_value in enumerate(left_data):
        matching_rows = right_hash.get(left_value, None)

        if require_match and matching_rows is None:
            # Wrap in a 1-tuple so that a multi-column (tuple) key is
            # formatted as a single value instead of breaking the "%"
            # operator with a TypeError.
            raise ValueError('Left key "%s" does not have a matching right key.' % (left_value,))

        # Rows with matches
        if matching_rows:
            for right_row in matching_rows:
                new_row = list(self._rows[left_index])

                for k, v in enumerate(right_row):
                    if drop_key_columns and k in right_key_indices:
                        continue

                    new_row.append(v)

                rows.append(Row(new_row, column_names))

                if keep_row_names:
                    row_names.append(self._row_names[left_index])
        # Rows without matches
        elif not inner:
            new_row = list(self._rows[left_index]) + unmatched_fill

            rows.append(Row(new_row, column_names))

            if keep_row_names:
                row_names.append(self._row_names[left_index])

    # Full outer join: append right-hand rows whose key never appears on the
    # left, padded with None for every left-hand column.
    if full_outer:
        left_set = set(left_data)
        left_fill = [None] * len(self._columns)

        for right_index, right_value in enumerate(right_data):
            if right_value in left_set:
                continue

            new_row = left_fill + list(right_table.rows[right_index])

            rows.append(Row(new_row, column_names))

    return self._fork(rows, column_names, column_types, row_names=row_names)