from copy import deepcopy
from dbt.context.context_config import ContextConfig
from dbt.contracts.graph.parsed import ParsedModelNode
import dbt.flags as flags
from dbt.logger import GLOBAL_LOGGER as logger
from dbt.node_types import NodeType
from dbt.parser.base import SimpleSQLParser
from dbt.parser.search import FileBlock
import dbt.tracking as tracking
from dbt import utils
from dbt_extractor import ExtractionError, py_extract_from_source  # type: ignore
from functools import reduce
from itertools import chain
import random
from typing import Any, Dict, Iterator, List, Optional, Tuple, Union

# debug loglines are used for integration testing. If you change
# the code at the beginning of the debug line, change the tests in
# test/integration/072_experimental_parser_tests/test_all_experimental_parser.py


class ModelParser(SimpleSQLParser[ParsedModelNode]):
    """Parser for model files.

    Besides the regular rendering inherited from ``SimpleSQLParser``, this
    parser can run the experimental static parser (``py_extract_from_source``)
    either because the experimental-parser flag is set, or on a random sample
    of calls so that its output can be compared against the regular rendering
    and reported through a tracking event.
    """

    def parse_from_dict(self, dct, validate=True) -> ParsedModelNode:
        """Build a ``ParsedModelNode`` from ``dct``, validating it first if asked."""
        if validate:
            ParsedModelNode.validate(dct)
        return ParsedModelNode.from_dict(dct)

    @property
    def resource_type(self) -> NodeType:
        """The node type this parser produces (always ``NodeType.Model``)."""
        return NodeType.Model

    @classmethod
    def get_compiled_path(cls, block: FileBlock):
        """Return the relative path of the file backing ``block``."""
        return block.path.relative_path

    # TODO when this is turned on by default, simplify the nasty if/else tree inside this method.
    def render_update(
        self, node: ParsedModelNode, config: ContextConfig
    ) -> None:
        """Populate ``node`` and ``config``, optionally via the experimental parser.

        When the experimental-parser flag is set and static parsing succeeds,
        the node is populated directly from the static parser's output.
        Otherwise the parent class's ``render_update`` is used. On a random
        sample of calls without the flag, a copy of the node is additionally
        populated from the static parser and compared with the regular result;
        the outcome is sent as a tracking event.
        """
        # TODO go back to 1/100 when this is turned on by default.
        # `True` roughly 1/50 times this function is called
        sample: bool = random.randint(1, 51) == 50

        # top-level declaration of variables
        experimentally_parsed: Optional[Union[str, Dict[str, List[Any]]]] = None
        result: List[str] = []
        # Only set when a sample copy is actually populated below. They must
        # default to None: with the experimental parser flag on, no sample
        # copy is built, and the comparison at the end has to be skipped
        # rather than reading unbound locals.
        exp_sample_node: Optional[ParsedModelNode] = None
        exp_sample_config: Optional[ContextConfig] = None

        # run the experimental parser if the flag is on or if we're sampling
        if flags.USE_EXPERIMENTAL_PARSER or sample:
            if self._has_banned_macro(node):
                experimentally_parsed = "has_banned_macro"
            else:
                # run the experimental parser and return the results
                try:
                    experimentally_parsed = py_extract_from_source(
                        node.raw_sql
                    )
                    logger.debug(f"1699: statically parsed {node.path}")
                # if we want information on what features are barring the experimental
                # parser from reading model files, this is where we would add that
                # since that information is stored in the `ExtractionError`.
                except ExtractionError:
                    experimentally_parsed = "cannot_parse"

        # if the parser succeeded, extract some data in easy-to-compare formats
        if isinstance(experimentally_parsed, dict):
            # format sources TODO change extractor to match this type
            experimentally_parsed['sources'] = [
                [s[0], s[1]] for s in experimentally_parsed['sources']
            ]

        # if we're sampling during a normal dbt run, populate an entirely new node to compare
        if not flags.USE_EXPERIMENTAL_PARSER:
            if sample and isinstance(experimentally_parsed, dict):
                # if this will _never_ mutate anything `self` we could avoid these deep copies,
                # but we can't really guarantee that going forward.
                model_parser_copy = self.partial_deepcopy()
                exp_sample_node = deepcopy(node)
                exp_sample_config = deepcopy(config)
                model_parser_copy.populate(
                    exp_sample_node,
                    exp_sample_config,
                    experimentally_parsed
                )

            super().render_update(node, config)

        # if the --use-experimental-parser flag was set, and the experimental parser succeeded
        elif isinstance(experimentally_parsed, dict):
            # update the unrendered config with values from the static parser.
            # values from yaml files are in there already
            self.populate(
                node,
                config,
                experimentally_parsed
            )

            self.manifest._parsing_info.static_analysis_parsed_path_count += 1

        # the experimental parser didn't run on this model.
        # fall back to python jinja rendering.
        elif isinstance(experimentally_parsed, str):
            if experimentally_parsed == "cannot_parse":
                result += ["01_stable_parser_cannot_parse"]
                logger.debug(
                    f"1602: parser fallback to jinja for {node.path}"
                )
            elif experimentally_parsed == "has_banned_macro":
                result += ["08_has_banned_macro"]
                logger.debug(
                    f"1601: parser fallback to jinja because of macro override for {node.path}"
                )

            super().render_update(node, config)

        # otherwise jinja rendering.
        else:
            super().render_update(node, config)

        if exp_sample_node is not None and exp_sample_config is not None:
            # now that the sample succeeded, is populated and the current
            # values are rendered, compare the two and collect the tracking messages
            result += _get_exp_sample_result(
                exp_sample_node,
                exp_sample_config,
                node,
                config,
            )

        # fire a tracking event. this fires one event for every sample
        # so that we have data on a per file basis. Not only can we expect
        # no false positives or misses, we can expect the number model
        # files parseable by the experimental parser to match our internal
        # testing.
        if result and tracking.active_user is not None:  # None in some tests
            tracking.track_experimental_parser_sample({
                "project_id": self.root_project.hashed_name(),
                "file_id": utils.get_hash(node),
                "status": result
            })

    # checks for banned macros
    def _has_banned_macro(
        self, node: ParsedModelNode
    ) -> bool:
        """Return True if `ref`, `source` or `config` is overridden by a macro.

        The manifest's macros are checked for those three names under both the
        node's own package and the root project.
        """
        # first check if there is a banned macro defined in scope for this model file
        root_project_name = self.root_project.project_name
        project_name = node.package_name
        banned_macros = ['ref', 'source', 'config']

        all_banned_macro_keys: Iterator[str] = chain.from_iterable(
            map(
                lambda name: [
                    f"macro.{project_name}.{name}",
                    f"macro.{root_project_name}.{name}"
                ],
                banned_macros
            )
        )

        return reduce(
            lambda z, key: z or (key in self.manifest.macros),
            all_banned_macro_keys,
            False
        )

    # this method updates the model node rendered and unrendered config as well
    # as the node object. Used to populate these values when circumventing jinja
    # rendering like the static parser.
    def populate(
        self,
        node: ParsedModelNode,
        config: ContextConfig,
        experimentally_parsed: Dict[str, Any]
    ):
        """Fill ``node`` and ``config`` from the static parser's output.

        ``experimentally_parsed`` is read for its 'configs', 'refs' and
        'sources' entries. Both ``node`` and ``config`` are mutated in place.
        """
        # manually fit configs in
        config._config_call_dict = _get_config_call_dict(experimentally_parsed)

        # if there are hooks present, this WILL render jinja. Will need to change
        # when the experimental parser supports hooks
        self.update_parsed_node_config(node, config)

        # update the unrendered config with values from the file.
        # values from yaml files are in there already
        node.unrendered_config.update(experimentally_parsed['configs'])

        # set refs and sources on the node object
        node.refs += experimentally_parsed['refs']
        node.sources += experimentally_parsed['sources']

        # configs don't need to be merged into the node because they
        # are read from config._config_call_dict

    # the manifest is often huge so this method avoids deepcopying it
    def partial_deepcopy(self):
        """Return a new ``ModelParser`` with deep-copied projects and a shared manifest."""
        return ModelParser(
            deepcopy(self.project),
            self.manifest,
            deepcopy(self.root_project)
        )


# pure function.
# safe to use elsewhere, but unlikely to be useful outside this file.
def _get_config_call_dict(
    static_parser_result: Dict[str, List[Any]]
) -> Dict[str, Any]:
    """Fold the 'configs' entries of a static parser result into one dict.

    Each entry is registered as a single-key config call through
    ``ContextConfig._add_config_call``; the accumulated dict is returned.
    """
    accumulated: Dict[str, Any] = {}

    for entry in static_parser_result['configs']:
        single_call = {entry[0]: entry[1]}
        ContextConfig._add_config_call(accumulated, single_call)

    return accumulated


# returns a list of string codes to be sent as a tracking event
def _get_exp_sample_result(
    sample_node: ParsedModelNode,
    sample_config: ContextConfig,
    node: ParsedModelNode,
    config: ContextConfig
) -> List[str]:
    """Compare a sample against the reference and return tracking status codes.

    Every ``(code, message)`` pair from ``_get_sample_result`` is rendered as
    ``"0<code>_experimental_<message>"``.
    """
    pairs: List[Tuple[int, str]] = _get_sample_result(sample_node, sample_config, node, config)

    return [f"0{code}_experimental_{msg}" for code, msg in pairs]


# returns a list of messages and int codes and messages that need a single digit
# prefix to be prepended before being sent as a tracking event
def _get_sample_result(
    sample_node: ParsedModelNode,
    sample_config: ContextConfig,
    node: ParsedModelNode,
    config: ContextConfig
) -> List[Tuple[int, str]]:
    """Diff the sample's configs, sources and refs against the reference ones.

    A "false positive" is a value present on the sample but absent from the
    reference; a "missed" value is the reverse. Each kind of mismatch is
    reported at most once, in ascending code order. When nothing differs the
    single pair ``(0, "exact_match")`` is returned.
    """
    def has_stray(candidates, reference) -> bool:
        # True as soon as one candidate is absent from the reference.
        return any(item not in reference for item in candidates)

    sample_calls = sample_config._config_call_dict
    reference_calls = config._config_call_dict

    # (code, message, values to look for, collection they should appear in)
    checks = [
        (2, "false_positive_config_value", sample_calls, reference_calls),
        (3, "missed_config_value", reference_calls, sample_calls),
        (4, "false_positive_source_value", sample_node.sources, node.sources),
        (5, "missed_source_value", node.sources, sample_node.sources),
        (6, "false_positive_ref_value", sample_node.refs, node.refs),
        (7, "missed_ref_value", node.refs, sample_node.refs),
    ]

    mismatches: List[Tuple[int, str]] = [
        (code, msg)
        for code, msg, candidates, reference in checks
        if has_stray(candidates, reference)
    ]

    # if there are no errors, return a success value
    return mismatches or [(0, "exact_match")]