# dbt-selly/dbt-env/lib/python3.8/site-packages/agate/table/__init__.py
#
# 362 lines
# 13 KiB
# Python

#!/usr/bin/env python
"""
The :class:`.Table` object is the most important class in agate. Tables are
created by supplying row data, column names and subclasses of :class:`.DataType`
to the constructor. Once created, the data in a table **can not be changed**.
This concept is central to agate.
Instead of modifying the data, various methods can be used to create new,
derivative tables. For example, the :meth:`.Table.select` method creates a new
table with only the specified columns. The :meth:`.Table.where` method creates
a new table with only those rows that pass a test. And :meth:`.Table.order_by`
creates a sorted table. In all of these cases the output is a new :class:`.Table`
and the existing table remains unmodified.
Iterating over a table, taking its length or indexing it is shorthand for
performing the same operation on :attr:`.Table.rows`. The columns of the table
can be accessed via :attr:`.Table.columns` and the rows via
:attr:`.Table.rows`. Both sequences can be accessed either by numeric index or
by name. (In the case of rows, row names are optional.)
"""
from itertools import chain
import sys
import warnings
import six
from six.moves import range # pylint: disable=W0622
from agate.columns import Column
from agate.data_types import DataType
from agate.mapped_sequence import MappedSequence
from agate.rows import Row
from agate.type_tester import TypeTester
from agate import utils
from agate.exceptions import CastError
from agate.warns import warn_duplicate_column, warn_unnamed_column
@six.python_2_unicode_compatible
class Table(object):
    """
    A dataset consisting of rows and columns. Columns refer to "vertical" slices
    of data that must all be of the same type. Rows refer to "horizontal" slices
    of data that may (and usually do) contain mixed types.

    The sequence of :class:`.Column` instances are retrieved via the
    :attr:`.Table.columns` property. They may be accessed by either numeric
    index or by unique column name.

    The sequence of :class:`.Row` instances are retrieved via the
    :attr:`.Table.rows` property. They may be accessed by either numeric index
    or, if specified, unique row names.

    :param rows:
        The data as a sequence of any sequences: tuples, lists, etc. If
        any row has fewer values than the number of columns, it will be filled
        out with nulls. No row may have more values than the number of columns.
    :param column_names:
        A sequence of string names for each column or `None`, in which case
        column names will be automatically assigned using :func:`.letter_name`.
    :param column_types:
        A sequence of instances of :class:`.DataType` or an instance of
        :class:`.TypeTester` or `None` in which case a generic TypeTester will
        be used. Alternatively, a dictionary with column names as keys and
        instances of :class:`.DataType` as values to specify some types.
    :param row_names:
        Specifies unique names for each row. This parameter is
        optional. If specified it may be 1) the name of a single column that
        contains a unique identifier for each row, 2) a key function that takes
        a :class:`.Row` and returns a unique identifier or 3) a sequence of
        unique identifiers of the same length as the sequence of rows. The
        uniqueness of resulting identifiers is not validated, so be certain
        the values you provide are truly unique.
    :param _is_fork:
        Used internally to skip certain validation steps when data
        is propagated from an existing table. When :code:`True`, rows are
        assumed to be :class:`.Row` instances, rather than raw data.
    """
    def __init__(self, rows, column_names=None, column_types=None, row_names=None, _is_fork=False):
        """
        Validate the arguments, cast the row data and build the row and column
        mappings. See the class docstring for the meaning of each parameter.

        :raises ValueError:
            If :code:`rows` is a string, a column type is not a
            :class:`.DataType`, the number of names and types differ, a row
            has more values than there are columns, :code:`row_names` is of an
            unsupported kind, or a row name is an :code:`int`.
        :raises CastError:
            If a value can not be cast to its column's type. The message is
            extended with the row index and column name.
        """
        if isinstance(rows, six.string_types):
            raise ValueError(
                'When created directly, the first argument to Table must be a sequence of rows. Did you want agate.Table.from_csv?'
            )

        # Validate column names
        if column_names:
            # NOTE(review): deduplicate presumably also fills in unnamed
            # columns -- see agate.utils.
            self._column_names = utils.deduplicate(column_names, column_names=True)
        elif rows:
            # No names given: generate one letter name per value of the first row.
            self._column_names = tuple(utils.letter_name(i) for i in range(len(rows[0])))
            warnings.warn(
                'Column names not specified. "%s" will be used as names.' % str(self._column_names),
                RuntimeWarning,
                stacklevel=2
            )
        else:
            self._column_names = tuple()

        len_column_names = len(self._column_names)

        # Validate column_types
        if column_types is None:
            column_types = TypeTester()
        elif isinstance(column_types, dict):
            # A dict only specifies some of the types; hand it to a tester as
            # forced types.
            if not all(isinstance(v, DataType) for v in column_types.values()):
                raise ValueError('Column types must be instances of DataType.')

            column_types = TypeTester(force=column_types)

        if isinstance(column_types, TypeTester):
            self._column_types = column_types.run(rows, self._column_names)
        else:
            # Materialize before validating so that a one-shot iterable is not
            # exhausted by the check and then stored as an empty tuple.
            column_types = tuple(column_types)

            if not all(isinstance(column_type, DataType) for column_type in column_types):
                raise ValueError('Column types must be instances of DataType.')

            self._column_types = column_types

        if len_column_names != len(self._column_types):
            raise ValueError('column_names and column_types must be the same length.')

        if not _is_fork:
            new_rows = []
            cast_funcs = [c.cast for c in self._column_types]

            for i, row in enumerate(rows):
                len_row = len(row)

                if len_row > len_column_names:
                    raise ValueError(
                        'Row %i has %i values, but Table only has %i columns.' % (i, len_row, len_column_names)
                    )
                elif len_row < len_column_names:
                    # Pad short rows with nulls up to the column count.
                    row = chain(row, [None] * (len_column_names - len_row))

                row_values = []

                for j, d in enumerate(row):
                    try:
                        row_values.append(cast_funcs[j](d))
                    except CastError as e:
                        # Re-raise with the location of the offending value.
                        raise CastError(str(e) + ' Error at row %s column %s.' % (i, self._column_names[j]))

                new_rows.append(Row(row_values, self._column_names))
        else:
            # Forked data is used as-is, without casting or length checks.
            new_rows = rows

        if row_names:
            if isinstance(row_names, six.string_types):
                # Name of a column holding the identifier of each row.
                computed_row_names = [row[row_names] for row in new_rows]
            elif callable(row_names):
                # Key function computing the identifier from each row.
                computed_row_names = [row_names(row) for row in new_rows]
            elif utils.issequence(row_names):
                computed_row_names = row_names
            else:
                raise ValueError('row_names must be a column name, function or sequence')

            for row_name in computed_row_names:
                # Exact type check: only plain ints are rejected.
                # NOTE(review): presumably because ints are reserved for
                # positional lookup in MappedSequence -- confirm.
                if type(row_name) is int:
                    raise ValueError('Row names cannot be of type int. Use Decimal for numbered row names.')

            self._row_names = tuple(computed_row_names)
        else:
            self._row_names = None

        self._rows = MappedSequence(new_rows, self._row_names)

        # Build columns
        new_columns = [
            Column(i, name, data_type, self._rows, row_names=self._row_names)
            for i, (name, data_type) in enumerate(zip(self._column_names, self._column_types))
        ]

        self._columns = MappedSequence(new_columns, self._column_names)

    def __str__(self):
        """
        Print the table's structure using :meth:`.Table.print_structure`.
        """
        structure = six.StringIO()

        self.print_structure(output=structure)

        return structure.getvalue()

    def __len__(self):
        """
        Shorthand for :code:`len(table.rows)`.
        """
        return len(self._rows)

    def __iter__(self):
        """
        Shorthand for :code:`iter(table.rows)`.
        """
        return iter(self._rows)

    def __getitem__(self, key):
        """
        Shorthand for :code:`table.rows[foo]`.
        """
        return self._rows[key]

    @property
    def column_types(self):
        """
        A tuple of :class:`.DataType` instances.
        """
        return self._column_types

    @property
    def column_names(self):
        """
        A tuple of strings.
        """
        return self._column_names

    @property
    def row_names(self):
        """
        A tuple of strings, if this table has row names.

        If this table does not have row names, then :code:`None`.
        """
        return self._row_names

    @property
    def columns(self):
        """
        A :class:`.MappedSequence` with column names for keys and
        :class:`.Column` instances for values.
        """
        return self._columns

    @property
    def rows(self):
        """
        A :class:`.MappedSequence` with row names for keys (if specified) and
        :class:`.Row` instances for values.
        """
        return self._rows

    def _fork(self, rows, column_names=None, column_types=None, row_names=None):
        """
        Create a new table using the metadata from this one.

        This method is used internally by functions like
        :meth:`.Table.order_by`.

        :param rows:
            Row data for the forked table.
        :param column_names:
            Column names for the forked table. If not specified, fork will use
            this table's column names.
        :param column_types:
            Column types for the forked table. If not specified, fork will use
            this table's column types.
        :param row_names:
            Row names for the forked table. If not specified, fork will use
            this table's row names.
        :returns:
            A new :class:`.Table` created with :code:`_is_fork=True`.
        """
        if column_names is None:
            column_names = self._column_names

        if column_types is None:
            column_types = self._column_types

        if row_names is None:
            row_names = self._row_names

        return Table(rows, column_names, column_types, row_names=row_names, _is_fork=True)

    def print_csv(self, **kwargs):
        """
        Print this table as a CSV.

        This is the same as passing :code:`sys.stdout` to :meth:`.Table.to_csv`.

        :code:`kwargs` will be passed on to :meth:`.Table.to_csv`.
        """
        self.to_csv(sys.stdout, **kwargs)

    def print_json(self, **kwargs):
        """
        Print this table as JSON.

        This is the same as passing :code:`sys.stdout` to
        :meth:`.Table.to_json`.

        :code:`kwargs` will be passed on to :meth:`.Table.to_json`.
        """
        self.to_json(sys.stdout, **kwargs)
from agate.table.aggregate import aggregate
from agate.table.bar_chart import bar_chart
from agate.table.bins import bins
from agate.table.column_chart import column_chart
from agate.table.compute import compute
from agate.table.denormalize import denormalize
from agate.table.distinct import distinct
from agate.table.exclude import exclude
from agate.table.find import find
from agate.table.from_csv import from_csv
from agate.table.from_fixed import from_fixed
from agate.table.from_json import from_json
from agate.table.from_object import from_object
from agate.table.group_by import group_by
from agate.table.homogenize import homogenize
from agate.table.join import join
from agate.table.limit import limit
from agate.table.line_chart import line_chart
from agate.table.merge import merge
from agate.table.normalize import normalize
from agate.table.order_by import order_by
from agate.table.pivot import pivot
from agate.table.print_bars import print_bars
from agate.table.print_html import print_html
from agate.table.print_structure import print_structure
from agate.table.print_table import print_table
from agate.table.rename import rename
from agate.table.scatterplot import scatterplot
from agate.table.select import select
from agate.table.to_csv import to_csv
from agate.table.to_json import to_json
from agate.table.where import where
# Attach the functions imported above from the agate.table submodules to the
# Table class, one class attribute per function. Each method is implemented in
# its own module and bound here, after the class body has been defined.
# NOTE(review): the imports presumably sit below the class definition to avoid
# circular imports with those submodules -- confirm before moving them.
Table.aggregate = aggregate
Table.bar_chart = bar_chart
Table.bins = bins
Table.column_chart = column_chart
Table.compute = compute
Table.denormalize = denormalize
Table.distinct = distinct
Table.exclude = exclude
Table.find = find
Table.from_csv = from_csv
Table.from_fixed = from_fixed
Table.from_json = from_json
Table.from_object = from_object
Table.group_by = group_by
Table.homogenize = homogenize
Table.join = join
Table.limit = limit
Table.line_chart = line_chart
Table.merge = merge
Table.normalize = normalize
Table.order_by = order_by
Table.pivot = pivot
Table.print_bars = print_bars
Table.print_html = print_html
Table.print_structure = print_structure
Table.print_table = print_table
Table.rename = rename
Table.scatterplot = scatterplot
Table.select = select
Table.to_csv = to_csv
Table.to_json = to_json
Table.where = where