229 lines
7.3 KiB
Python
229 lines
7.3 KiB
Python
from codecs import BOM_UTF8
|
|
|
|
import agate
|
|
import datetime
|
|
import isodate
|
|
import json
|
|
import dbt.utils
|
|
from typing import Iterable, List, Dict, Union, Optional, Any
|
|
|
|
from dbt.exceptions import RuntimeException
|
|
|
|
|
|
# The UTF-8 byte-order mark decoded to text. from_csv compares the first
# character of a file against this to decide whether to skip it.
BOM = BOM_UTF8.decode('utf-8') # '\ufeff'
|
|
|
|
|
|
class ISODateTime(agate.data_types.DateTime):
    """An agate ``DateTime`` type whose string parsing is limited to ISO 8601.

    Strings are parsed exclusively with ``isodate.parse_datetime``; anything
    that parser rejects is reported as a cast failure.
    """

    def cast(self, d):
        """Cast a single value to a ``datetime.datetime``.

        :param d: The value to cast. ``None`` and ``datetime.datetime``
            values are returned unchanged, ``datetime.date`` values are
            combined with midnight, and strings are stripped and then either
            mapped to ``None`` (when they match one of ``self.null_values``
            case-insensitively) or parsed as ISO 8601.
        :returns: A ``datetime.datetime`` or ``None``.
        :raises agate.exceptions.CastError: If the value is of any other
            type, or is a string that cannot be parsed.
        """
        # this is agate.data_types.DateTime.cast with the "clever" bits removed
        # so we only handle ISO8601 stuff
        if isinstance(d, datetime.datetime) or d is None:
            return d
        elif isinstance(d, datetime.date):
            return datetime.datetime.combine(d, datetime.time(0, 0, 0))
        elif isinstance(d, str):
            d = d.strip()
            if d.lower() in self.null_values:
                return None
            try:
                return isodate.parse_datetime(d)
            except Exception:
                # Any parser failure just means "not an ISO 8601 datetime";
                # fall through to the CastError below. Deliberately not a
                # bare ``except`` so KeyboardInterrupt/SystemExit propagate.
                pass

        # Wrap the value in a 1-tuple so that tuple inputs are rendered in
        # the message instead of being unpacked as format arguments (which
        # would raise TypeError rather than the expected CastError).
        raise agate.exceptions.CastError(
            'Can not parse value "%s" as datetime.' % (d,)
        )
|
|
|
|
|
|
def build_type_tester(
    text_columns: Iterable[str],
    string_null_values: Optional[Iterable[str]] = ('null', '')
) -> agate.TypeTester:
    """Build the agate ``TypeTester`` used when constructing tables.

    :param text_columns: Names of columns that are forced to the ``Text``
        type instead of being inferred.
    :param string_null_values: The ``null_values`` given to every ``Text``
        type created here (both the forced ones and the candidate one). The
        non-text candidate types always use ``('null', '')``.
    :returns: A ``TypeTester`` configured with the forced columns and the
        candidate types listed below.
    """
    # Null markers for every non-text candidate type; these are fixed and do
    # not follow ``string_null_values``.
    standard_nulls = ('null', '')

    number_type = agate.data_types.Number(null_values=standard_nulls)
    date_type = agate.data_types.Date(
        null_values=standard_nulls,
        date_format='%Y-%m-%d',
    )
    datetime_type = agate.data_types.DateTime(
        null_values=standard_nulls,
        datetime_format='%Y-%m-%d %H:%M:%S',
    )
    iso_datetime_type = ISODateTime(null_values=standard_nulls)
    boolean_type = agate.data_types.Boolean(
        true_values=('true',),
        false_values=('false',),
        null_values=standard_nulls,
    )
    text_type = agate.data_types.Text(null_values=string_null_values)

    # Candidate types are handed to the TypeTester in exactly this order.
    candidate_types = [
        number_type,
        date_type,
        datetime_type,
        iso_datetime_type,
        boolean_type,
        text_type,
    ]

    # Each forced column gets its own Text instance.
    forced_columns = {}
    for column in text_columns:
        forced_columns[column] = agate.data_types.Text(
            null_values=string_null_values
        )

    return agate.TypeTester(force=forced_columns, types=candidate_types)
|
|
|
|
|
|
# Shared type tester with no columns forced to text and the default string
# null values. Used by table_from_rows (when no text-only columns are given)
# and by table_from_data.
DEFAULT_TYPE_TESTER = build_type_tester(())
|
|
|
|
|
|
def table_from_rows(
    rows: List[Any],
    column_names: Iterable[str],
    text_only_columns: Optional[Iterable[str]] = None,
) -> agate.Table:
    """Build an Agate table from row data and column names.

    :param rows: The row data passed straight to ``agate.Table``.
    :param column_names: The column names passed straight to ``agate.Table``.
    :param text_only_columns: Columns to force to text. When ``None`` the
        shared ``DEFAULT_TYPE_TESTER`` is used. Any other value (including an
        empty collection) builds a dedicated type tester.
    :returns: The resulting ``agate.Table``.
    """
    if text_only_columns is not None:
        # With explicit text-only columns, empty strings and literal 'null'
        # strings must stay as they are rather than being turned into None.
        tester = build_type_tester(
            text_only_columns,
            string_null_values=()
        )
    else:
        tester = DEFAULT_TYPE_TESTER

    return agate.Table(rows, column_names, column_types=tester)
|
|
|
|
|
|
def table_from_data(data, column_names: Iterable[str]) -> agate.Table:
    """Convert a list of dictionaries into an Agate table.

    :param data: The records to load; must support ``len()``.
    :param column_names: The columns of the returned table, in the order
        they should appear.
    :returns: An ``agate.Table`` restricted to ``column_names``.
    """
    # With no data there is nothing to infer from: return an empty table
    # that carries only the requested column names.
    if len(data) == 0:
        return agate.Table([], column_names=column_names)

    # Building from a list of dicts does not preserve the column order of
    # `data`, so `select` is used afterwards to put the columns in order.
    unordered = agate.Table.from_object(
        data, column_types=DEFAULT_TYPE_TESTER
    )
    return unordered.select(column_names)
|
|
|
|
|
|
def table_from_data_flat(data, column_names: Iterable[str]) -> agate.Table:
    """
    Convert a list of dictionaries into an Agate table without coercing
    string values.

    String values keep their literal form: '005' is not turned into a
    number, and '' or 'null' are not turned into None. Container values
    (dict, list, tuple) are stored as JSON strings. Every column that holds
    a string or a container in at least one row is treated as text-only.

    :param data: An iterable of mappings; each must contain every name in
        ``column_names``.
    :param column_names: The columns to extract, in output order.
    :returns: The table built by ``table_from_rows``.
    """
    text_only_columns = set()
    rows = []

    for record in data:
        flat_row = []
        for name in column_names:
            cell = record[name]
            is_container = isinstance(cell, (dict, list, tuple))
            if is_container:
                # Represent container types as json strings
                cell = json.dumps(cell, cls=dbt.utils.JSONEncoder)
            if is_container or isinstance(cell, str):
                text_only_columns.add(name)
            flat_row.append(cell)
        rows.append(flat_row)

    # text_only_columns is always a set here (possibly empty), never None.
    return table_from_rows(
        rows=rows,
        column_names=column_names,
        text_only_columns=text_only_columns
    )
|
|
|
|
|
|
def empty_table():
    """Return an Agate table with no rows, to be used in place of None."""
    no_rows = []
    return agate.Table(rows=no_rows)
|
|
|
|
|
|
def as_matrix(table):
    """Return the data of an agate table as a list of per-row values.

    Column names are not included; each element is whatever ``values()``
    returns for the corresponding row.
    """
    matrix = []
    for row in table.rows.values():
        matrix.append(row.values())
    return matrix
|
|
|
|
|
|
def from_csv(abspath, text_columns):
    """Load a UTF-8 CSV file into an Agate table.

    :param abspath: Path of the CSV file to open.
    :param text_columns: Column names that are forced to the text type.
    :returns: The ``agate.Table`` parsed from the file.
    """
    tester = build_type_tester(text_columns=text_columns)
    with open(abspath, encoding='utf-8') as handle:
        leading_char = handle.read(1)
        if leading_char != BOM:
            # No byte-order mark was consumed, so rewind to avoid dropping
            # the first real character of the file.
            handle.seek(0)
        return agate.Table.from_csv(handle, column_types=tester)
|
|
|
|
|
|
class _NullMarker:
    """Placeholder column type for a column whose values are all None.

    Instances carry no data; other code only checks for them with
    ``isinstance``.
    """
    pass
|
|
|
|
|
|
# A column type while tables are being merged: either a concrete agate data
# type or the _NullMarker placeholder.
NullableAgateType = Union[agate.data_types.DataType, _NullMarker]
|
|
|
|
|
|
class ColumnTypeBuilder(Dict[str, NullableAgateType]):
    """Mapping of column name to column type that merges on assignment.

    Assigning to a key that is already present does not blindly overwrite:
    a ``_NullMarker`` gives way to any other value, a concrete type is kept
    when a ``_NullMarker`` or a compatible type is assigned, and an
    incompatible type raises.
    """

    def __init__(self):
        super().__init__()

    def __setitem__(self, key, value):
        """Record ``value`` as the type of column ``key``, merging as needed.

        :raises RuntimeException: If ``key`` already has a concrete type and
            ``value`` is not an instance of that type's class.
        """
        # First sighting of the column, or only a null placeholder so far:
        # take the new value as-is.
        if key not in self or isinstance(self[key], _NullMarker):
            super().__setitem__(key, value)
            return

        # A null placeholder never replaces a concrete type.
        if isinstance(value, _NullMarker):
            return

        existing_type = self[key]
        if isinstance(value, type(existing_type)):
            # Compatible with what is stored; keep the existing entry.
            return

        # actual type mismatch!
        raise RuntimeException(
            f'Tables contain columns with the same names ({key}), '
            f'but different types ({value} vs {existing_type})'
        )

    def finalize(self) -> Dict[str, agate.data_types.DataType]:
        """Return a plain dict with every null placeholder resolved.

        Columns still marked with ``_NullMarker`` become
        ``agate.data_types.Number()`` (this is what agate would do); all
        other entries are carried over unchanged.
        """
        return {
            name: (
                agate.data_types.Number()
                if isinstance(column_type, _NullMarker)
                else column_type
            )
            for name, column_type in self.items()
        }
|
|
|
|
|
|
def _merged_column_types(
    tables: List[agate.Table]
) -> Dict[str, agate.data_types.DataType]:
    """Compute one column type per column name across all ``tables``.

    This is a lot like agate.Table.merge, but a column whose values are all
    None in a given table (which includes a column with no values at all)
    is treated as "any type" for that table.

    :param tables: The tables whose columns are combined.
    :returns: Mapping of column name to the resolved agate data type.
    """
    builder: ColumnTypeBuilder = ColumnTypeBuilder()
    for table in tables:
        names_and_types = zip(table.column_names, table.column_types)
        for column_name, declared_type in names_and_types:
            column_type: NullableAgateType = declared_type
            # avoid over-sensitive type inference
            if all(cell is None for cell in table.columns[column_name]):
                column_type = _NullMarker()
            builder[column_name] = column_type

    return builder.finalize()
|
|
|
|
|
|
def merge_tables(tables: List[agate.Table]) -> agate.Table:
    """Merge several Agate tables into one.

    This is similar to agate.Table.merge, but column types are taken from
    ``_merged_column_types`` so that all-null data is handled more
    gracefully during merges.

    :param tables: The tables to merge, in output row order.
    :returns: A single ``agate.Table`` with the merged columns and all rows.
    """
    merged_types = _merged_column_types(tables)
    column_names = tuple(merged_types.keys())
    column_types = tuple(merged_types.values())

    rows: List[agate.Row] = []
    for table in tables:
        same_layout = (
            table.column_names == column_names and
            table.column_types == column_types
        )
        if same_layout:
            # Rows already match the merged layout; reuse them directly.
            rows.extend(table.rows)
            continue

        # Rebuild every row against the merged column names, filling any
        # column the row does not have with None.
        rows.extend(
            agate.Row(
                [row.get(name, None) for name in column_names],
                column_names,
            )
            for row in table.rows
        )

    # _is_fork to tell agate that we already made things into `Row`s.
    return agate.Table(rows, column_names, column_types, _is_fork=True)
|