from collections import namedtuple from copy import deepcopy from typing import List, Iterable, Optional, Dict, Set, Tuple, Any import threading from dbt.logger import CACHE_LOGGER as logger from dbt.utils import lowercase import dbt.exceptions _ReferenceKey = namedtuple('_ReferenceKey', 'database schema identifier') def _make_key(relation) -> _ReferenceKey: """Make _ReferenceKeys with lowercase values for the cache so we don't have to keep track of quoting """ # databases and schemas can both be None return _ReferenceKey(lowercase(relation.database), lowercase(relation.schema), lowercase(relation.identifier)) def dot_separated(key: _ReferenceKey) -> str: """Return the key in dot-separated string form. :param _ReferenceKey key: The key to stringify. """ return '.'.join(map(str, key)) class _CachedRelation: """Nothing about _CachedRelation is guaranteed to be thread-safe! :attr str schema: The schema of this relation. :attr str identifier: The identifier of this relation. :attr Dict[_ReferenceKey, _CachedRelation] referenced_by: The relations that refer to this relation. :attr BaseRelation inner: The underlying dbt relation. """ def __init__(self, inner): self.referenced_by = {} self.inner = inner def __str__(self) -> str: return ( '_CachedRelation(database={}, schema={}, identifier={}, inner={})' ).format(self.database, self.schema, self.identifier, self.inner) @property def database(self) -> Optional[str]: return lowercase(self.inner.database) @property def schema(self) -> Optional[str]: return lowercase(self.inner.schema) @property def identifier(self) -> Optional[str]: return lowercase(self.inner.identifier) def __copy__(self): new = self.__class__(self.inner) new.__dict__.update(self.__dict__) return new def __deepcopy__(self, memo): new = self.__class__(self.inner.incorporate()) new.__dict__.update(self.__dict__) new.referenced_by = deepcopy(self.referenced_by, memo) def is_referenced_by(self, key): return key in self.referenced_by def key(self): """Get the _ReferenceKey that represents this relation :return _ReferenceKey: A key for this relation. """ return _make_key(self) def add_reference(self, referrer: '_CachedRelation'): """Add a reference from referrer to self, indicating that if this node were drop...cascaded, the referrer would be dropped as well. :param _CachedRelation referrer: The node that refers to this node. """ self.referenced_by[referrer.key()] = referrer def collect_consequences(self): """Recursively collect a set of _ReferenceKeys that would consequentially get dropped if this were dropped via "drop ... cascade". :return Set[_ReferenceKey]: All the relations that would be dropped """ consequences = {self.key()} for relation in self.referenced_by.values(): consequences.update(relation.collect_consequences()) return consequences def release_references(self, keys): """Non-recursively indicate that an iterable of _ReferenceKey no longer exist. Unknown keys are ignored. :param Iterable[_ReferenceKey] keys: The keys to drop. """ keys = set(self.referenced_by) & set(keys) for key in keys: self.referenced_by.pop(key) def rename(self, new_relation): """Rename this cached relation to new_relation. Note that this will change the output of key(), all refs must be updated! :param _CachedRelation new_relation: The new name to apply to the relation """ # Relations store this stuff inside their `path` dict. But they # also store a table_name, and usually use it in their .render(), # so we need to update that as well. It doesn't appear that # table_name is ever anything but the identifier (via .create()) self.inner = self.inner.incorporate( path={ 'database': new_relation.inner.database, 'schema': new_relation.inner.schema, 'identifier': new_relation.inner.identifier }, ) def rename_key(self, old_key, new_key): """Rename a reference that may or may not exist. Only handles the reference itself, so this is the other half of what `rename` does. If old_key is not in referenced_by, this is a no-op. :param _ReferenceKey old_key: The old key to be renamed. :param _ReferenceKey new_key: The new key to rename to. :raises InternalError: If the new key already exists. """ if new_key in self.referenced_by: dbt.exceptions.raise_cache_inconsistent( 'in rename of "{}" -> "{}", new name is in the cache already' .format(old_key, new_key) ) if old_key not in self.referenced_by: return value = self.referenced_by.pop(old_key) self.referenced_by[new_key] = value def dump_graph_entry(self): """Return a key/value pair representing this key and its referents. return List[str]: The dot-separated form of all referent keys. """ return [dot_separated(r) for r in self.referenced_by] def lazy_log(msg, func): if logger.disabled: return logger.debug(msg.format(func())) class RelationsCache: """A cache of the relations known to dbt. Keeps track of relationships declared between tables and handles renames/drops as a real database would. :attr Dict[_ReferenceKey, _CachedRelation] relations: The known relations. :attr threading.RLock lock: The lock around relations, held during updates. The adapters also hold this lock while filling the cache. :attr Set[str] schemas: The set of known/cached schemas, all lowercased. """ def __init__(self) -> None: self.relations: Dict[_ReferenceKey, _CachedRelation] = {} self.lock = threading.RLock() self.schemas: Set[Tuple[Optional[str], Optional[str]]] = set() def add_schema( self, database: Optional[str], schema: Optional[str], ) -> None: """Add a schema to the set of known schemas (case-insensitive) :param database: The database name to add. :param schema: The schema name to add. """ self.schemas.add((lowercase(database), lowercase(schema))) def drop_schema( self, database: Optional[str], schema: Optional[str], ) -> None: """Drop the given schema and remove it from the set of known schemas. Then remove all its contents (and their dependents, etc) as well. """ key = (lowercase(database), lowercase(schema)) if key not in self.schemas: return # avoid iterating over self.relations while removing things by # collecting the list first. with self.lock: to_remove = self._list_relations_in_schema(database, schema) self._remove_all(to_remove) # handle a drop_schema race by using discard() over remove() self.schemas.discard(key) def update_schemas(self, schemas: Iterable[Tuple[Optional[str], str]]): """Add multiple schemas to the set of known schemas (case-insensitive) :param schemas: An iterable of the schema names to add. """ self.schemas.update((lowercase(d), s.lower()) for (d, s) in schemas) def __contains__(self, schema_id: Tuple[Optional[str], str]): """A schema is 'in' the relations cache if it is in the set of cached schemas. :param schema_id: The db name and schema name to look up. """ db, schema = schema_id return (lowercase(db), schema.lower()) in self.schemas def dump_graph(self): """Dump a key-only representation of the schema to a dictionary. Every known relation is a key with a value of a list of keys it is referenced by. """ # we have to hold the lock for the entire dump, if other threads modify # self.relations or any cache entry's referenced_by during iteration # it's a runtime error! with self.lock: return { dot_separated(k): v.dump_graph_entry() for k, v in self.relations.items() } def _setdefault(self, relation: _CachedRelation): """Add a relation to the cache, or return it if it already exists. :param _CachedRelation relation: The relation to set or get. :return _CachedRelation: The relation stored under the given relation's key """ self.add_schema(relation.database, relation.schema) key = relation.key() return self.relations.setdefault(key, relation) def _add_link(self, referenced_key, dependent_key): """Add a link between two relations to the database. Both the old and new entries must alraedy exist in the database. :param _ReferenceKey referenced_key: The key identifying the referenced model (the one that if dropped will drop the dependent model). :param _ReferenceKey dependent_key: The key identifying the dependent model. :raises InternalError: If either entry does not exist. """ referenced = self.relations.get(referenced_key) if referenced is None: return if referenced is None: dbt.exceptions.raise_cache_inconsistent( 'in add_link, referenced link key {} not in cache!' .format(referenced_key) ) dependent = self.relations.get(dependent_key) if dependent is None: dbt.exceptions.raise_cache_inconsistent( 'in add_link, dependent link key {} not in cache!' .format(dependent_key) ) assert dependent is not None # we just raised! referenced.add_reference(dependent) def add_link(self, referenced, dependent): """Add a link between two relations to the database. If either relation does not exist, it will be added as an "external" relation. The dependent model refers _to_ the referenced model. So, given arguments of (jake_test, bar, jake_test, foo): both values are in the schema jake_test and foo is a view that refers to bar, so "drop bar cascade" will drop foo and all of foo's dependents. :param BaseRelation referenced: The referenced model. :param BaseRelation dependent: The dependent model. :raises InternalError: If either entry does not exist. """ ref_key = _make_key(referenced) if (ref_key.database, ref_key.schema) not in self: # if we have not cached the referenced schema at all, we must be # referring to a table outside our control. There's no need to make # a link - we will never drop the referenced relation during a run. logger.debug( '{dep!s} references {ref!s} but {ref.database}.{ref.schema} ' 'is not in the cache, skipping assumed external relation' .format(dep=dependent, ref=ref_key) ) return if ref_key not in self.relations: # Insert a dummy "external" relation. referenced = referenced.replace( type=referenced.External ) self.add(referenced) dep_key = _make_key(dependent) if dep_key not in self.relations: # Insert a dummy "external" relation. dependent = dependent.replace( type=referenced.External ) self.add(dependent) logger.debug( 'adding link, {!s} references {!s}'.format(dep_key, ref_key) ) with self.lock: self._add_link(ref_key, dep_key) def add(self, relation): """Add the relation inner to the cache, under the schema schema and identifier identifier :param BaseRelation relation: The underlying relation. """ cached = _CachedRelation(relation) logger.debug('Adding relation: {!s}'.format(cached)) lazy_log('before adding: {!s}', self.dump_graph) with self.lock: self._setdefault(cached) lazy_log('after adding: {!s}', self.dump_graph) def _remove_refs(self, keys): """Removes all references to all entries in keys. This does not cascade! :param Iterable[_ReferenceKey] keys: The keys to remove. """ # remove direct refs for key in keys: del self.relations[key] # then remove all entries from each child for cached in self.relations.values(): cached.release_references(keys) def _drop_cascade_relation(self, dropped): """Drop the given relation and cascade it appropriately to all dependent relations. :param _CachedRelation dropped: An existing _CachedRelation to drop. """ if dropped not in self.relations: logger.debug('dropped a nonexistent relationship: {!s}' .format(dropped)) return consequences = self.relations[dropped].collect_consequences() logger.debug( 'drop {} is cascading to {}'.format(dropped, consequences) ) self._remove_refs(consequences) def drop(self, relation): """Drop the named relation and cascade it appropriately to all dependent relations. Because dbt proactively does many `drop relation if exist ... cascade` that are noops, nonexistent relation drops cause a debug log and no other actions. :param str schema: The schema of the relation to drop. :param str identifier: The identifier of the relation to drop. """ dropped = _make_key(relation) logger.debug('Dropping relation: {!s}'.format(dropped)) with self.lock: self._drop_cascade_relation(dropped) def _rename_relation(self, old_key, new_relation): """Rename a relation named old_key to new_key, updating references. Return whether or not there was a key to rename. :param _ReferenceKey old_key: The existing key, to rename from. :param _CachedRelation new_key: The new relation, to rename to. """ # On the database level, a rename updates all values that were # previously referenced by old_name to be referenced by new_name. # basically, the name changes but some underlying ID moves. Kind of # like an object reference! relation = self.relations.pop(old_key) new_key = new_relation.key() # relaton has to rename its innards, so it needs the _CachedRelation. relation.rename(new_relation) # update all the relations that refer to it for cached in self.relations.values(): if cached.is_referenced_by(old_key): logger.debug( 'updated reference from {0} -> {2} to {1} -> {2}' .format(old_key, new_key, cached.key()) ) cached.rename_key(old_key, new_key) self.relations[new_key] = relation # also fixup the schemas! self.add_schema(new_key.database, new_key.schema) return True def _check_rename_constraints(self, old_key, new_key): """Check the rename constraints, and return whether or not the rename can proceed. If the new key is already present, that is an error. If the old key is absent, we debug log and return False, assuming it's a temp table being renamed. :param _ReferenceKey old_key: The existing key, to rename from. :param _ReferenceKey new_key: The new key, to rename to. :return bool: If the old relation exists for renaming. :raises InternalError: If the new key is already present. """ if new_key in self.relations: dbt.exceptions.raise_cache_inconsistent( 'in rename, new key {} already in cache: {}' .format(new_key, list(self.relations.keys())) ) if old_key not in self.relations: logger.debug( 'old key {} not found in self.relations, assuming temporary' .format(old_key) ) return False return True def rename(self, old, new): """Rename the old schema/identifier to the new schema/identifier and update references. If the new schema/identifier is already present, that is an error. If the schema/identifier key is absent, we only debug log and return, assuming it's a temp table being renamed. :param BaseRelation old: The existing relation name information. :param BaseRelation new: The new relation name information. :raises InternalError: If the new key is already present. """ old_key = _make_key(old) new_key = _make_key(new) logger.debug('Renaming relation {!s} to {!s}'.format( old_key, new_key )) lazy_log('before rename: {!s}', self.dump_graph) with self.lock: if self._check_rename_constraints(old_key, new_key): self._rename_relation(old_key, _CachedRelation(new)) else: self._setdefault(_CachedRelation(new)) lazy_log('after rename: {!s}', self.dump_graph) def get_relations( self, database: Optional[str], schema: Optional[str] ) -> List[Any]: """Case-insensitively yield all relations matching the given schema. :param str schema: The case-insensitive schema name to list from. :return List[BaseRelation]: The list of relations with the given schema """ database = lowercase(database) schema = lowercase(schema) with self.lock: results = [ r.inner for r in self.relations.values() if (lowercase(r.schema) == schema and lowercase(r.database) == database) ] if None in results: dbt.exceptions.raise_cache_inconsistent( 'in get_relations, a None relation was found in the cache!' ) return results def clear(self): """Clear the cache""" with self.lock: self.relations.clear() self.schemas.clear() def _list_relations_in_schema( self, database: Optional[str], schema: Optional[str] ) -> List[_CachedRelation]: """Get the relations in a schema. Callers should hold the lock.""" key = (lowercase(database), lowercase(schema)) to_remove: List[_CachedRelation] = [] for cachekey, relation in self.relations.items(): if (cachekey.database, cachekey.schema) == key: to_remove.append(relation) return to_remove def _remove_all(self, to_remove: List[_CachedRelation]): """Remove all the listed relations. Ignore relations that have been cascaded out. """ for relation in to_remove: # it may have been cascaded out already drop_key = _make_key(relation) if drop_key in self.relations: self.drop(drop_key)