import re
from collections import namedtuple

import dbt.exceptions


def regex(pat):
    """Compile ``pat`` with the ``re.DOTALL`` and ``re.MULTILINE`` flags."""
    return re.compile(pat, re.DOTALL | re.MULTILINE)


class BlockData:
    """raw plaintext data from the top level of the file."""

    def __init__(self, contents):
        # fixed marker type name that distinguishes raw data from real tags
        self.block_type_name = '__dbt__data'
        self.contents = contents
        # raw data has no surrounding tags, so the "full block" is the same
        # text as the contents
        self.full_block = contents


class BlockTag:
    """A jinja block: its type name, optional block name and text.

    :param block_type_name: the tag type, e.g. ``macro`` for ``{% macro %}``.
    :param block_name: the name following the type, or None.
    :param contents: the text stored as the block's contents.
    :param full_block: the text stored as the complete block.
    :param kw: extra keyword arguments are accepted and ignored.
    """

    def __init__(self, block_type_name, block_name, contents=None,
                 full_block=None, **kw):
        self.block_type_name = block_type_name
        self.block_name = block_name
        self.contents = contents
        self.full_block = full_block

    def __str__(self):
        return 'BlockTag({!r}, {!r})'.format(self.block_type_name,
                                             self.block_name)

    def __repr__(self):
        return str(self)

    @property
    def end_block_type_name(self):
        """The type name of the tag that closes this block (``end<type>``)."""
        return 'end{}'.format(self.block_type_name)

    def end_pat(self):
        """Return a compiled regex matching this block's closing tag.

        The whole closing tag, including the optional ``-`` whitespace
        control markers, is captured in the ``endblock`` group.
        """
        # we don't want to use string formatting here because jinja uses most
        # of the string formatting operators in its syntax...
        pattern = ''.join((
            r'(?P<endblock>((?:\s*\{\%\-|\{\%)\s*',
            self.end_block_type_name,
            r'\s*(?:\-\%\}\s*|\%\})))',
        ))
        return regex(pattern)


# A tag found in the input: its type name, optional block name, and the
# absolute start/end offsets of the tag itself.
Tag = namedtuple('Tag', 'block_type_name block_name start end')


_NAME_PATTERN = r'[A-Za-z_][A-Za-z_0-9]*'

# NOTE: the group names below are part of the contract with TagIterator,
# which tells matches apart by looking up 'comment_start', 'expr_start',
# 'expr_end', 'block_type_name', 'block_name' and 'tag_close' in
# match.groupdict().
COMMENT_START_PATTERN = regex(r'(?:(?P<comment_start>(\s*\{\#)))')
COMMENT_END_PATTERN = regex(r'(.*?)(\s*\#\})')
RAW_START_PATTERN = regex(
    r'(?:\s*\{\%\-|\{\%)\s*(?P<raw_start>(raw))\s*(?:\-\%\}\s*|\%\})'
)
EXPR_START_PATTERN = regex(r'(?P<expr_start>(\{\{\s*))')
EXPR_END_PATTERN = regex(r'(?P<expr_end>(\s*\}\}))')

BLOCK_START_PATTERN = regex(''.join((
    r'(?:\s*\{\%\-|\{\%)\s*',
    r'(?P<block_type_name>({}))'.format(_NAME_PATTERN),
    # some blocks have a 'block name'.
    r'(?:\s+(?P<block_name>({})))?'.format(_NAME_PATTERN),
)))


# a complete raw block: the opening tag, anything at all (non-greedy), and
# the first closing endraw tag.
RAW_BLOCK_PATTERN = regex(''.join((
    r'(?:\s*\{\%\-|\{\%)\s*raw\s*(?:\-\%\}\s*|\%\})',
    r'(?:.*?)',
    r'(?:\s*\{\%\-|\{\%)\s*endraw\s*(?:\-\%\}\s*|\%\})',
)))

TAG_CLOSE_PATTERN = regex(r'(?:(?P<tag_close>(\-\%\}\s*|\%\})))')

# stolen from jinja's lexer. Note that we've consumed all prefix whitespace by
# the time we want to use this.
STRING_PATTERN = regex(
    r"(?P<string>('([^'\\]*(?:\\.[^'\\]*)*)'|"
    r'"([^"\\]*(?:\\.[^"\\]*)*)"))'
)

# a single opening quote character (either kind): a string literal starts
# here and has to be skipped as a unit.
QUOTE_START_PATTERN = regex(r'''(?P<quote_start>(['"]))''')


class TagIterator:
    """Scan jinja source text and yield a ``Tag`` for every block tag.

    Comments (``{# ... #}``) and expressions (``{{ ... }}``) are skipped;
    a ``{% raw %}...{% endraw %}`` region is consumed as a single tag.
    """

    def __init__(self, data):
        self.data = data
        self.blocks = []
        self._parenthesis_stack = []
        # absolute offset into self.data of the next character to examine
        self.pos = 0

    def linepos(self, end=None) -> str:
        """Format an absolute position as ``'<line>:<column>'``.

        :param end: absolute offset into the data; defaults to the current
            position.
        :return: the 1-indexed line number and the offset from the start of
            that line, joined by a colon.
        """
        end_val: int = self.pos if end is None else end
        data = self.data[:end_val]
        # if not found, rfind returns -1, and -1+1=0, which is perfect!
        last_line_start = data.rfind('\n') + 1
        # it's easy to forget this, but line numbers are 1-indexed
        line_number = data.count('\n') + 1
        return f'{line_number}:{end_val - last_line_start}'

    def advance(self, new_position):
        """Move the cursor to the absolute offset ``new_position``."""
        self.pos = new_position

    def rewind(self, amount=1):
        """Move the cursor back by ``amount`` characters."""
        self.pos -= amount

    def _search(self, pattern):
        return pattern.search(self.data, self.pos)

    def _match(self, pattern):
        return pattern.match(self.data, self.pos)

    def _first_match(self, *patterns, **kwargs):
        """Try every pattern from the current position and pick one match.

        :param patterns: compiled patterns to try.
        :param method: keyword-only; ``'search'`` (the default) scans forward
            from the cursor, any other value anchors at the cursor.
        :return: the match that ends earliest, or None if nothing matched.
        """
        # default to 'search', but sometimes we want to 'match'.
        if kwargs.get('method', 'search') == 'search':
            find = self._search
        else:
            find = self._match

        matches = [m for m in map(find, patterns) if m]
        if not matches:
            return None
        # if there are multiple matches, pick the least greedy match
        # TODO: do I need to account for m.start(), or is this ok?
        return min(matches, key=lambda m: m.end())

    def _expect_match(self, expected_name, *patterns, **kwargs):
        """Like ``_first_match``, but report a compiler error on no match.

        ``expected_name`` is only used to build the error message.
        """
        match = self._first_match(*patterns, **kwargs)
        if match is None:
            msg = 'unexpected EOF, expected {}, got "{}"'.format(
                expected_name, self.data[self.pos:]
            )
            dbt.exceptions.raise_compiler_error(msg)
        return match

    def handle_expr(self, match):
        """Handle an expression. At this point we're at a string like:
            {{ 1 + 2 }}
            ^ right here

        And the match contains "{{ "

        We expect to find a `}}`, but we might find one in a string before
        that. Imagine the case of `{{ 2 * "}}" }}`...

        You're not allowed to have blocks or comments inside an expr so it is
        pretty straightforward, I hope: only strings can get in the way.
        """
        self.advance(match.end())
        while True:
            match = self._expect_match('}}',
                                       EXPR_END_PATTERN,
                                       QUOTE_START_PATTERN)
            if match.groupdict().get('expr_end') is not None:
                break
            else:
                # it's a quote. we haven't advanced for this match yet, so
                # just slurp up the whole string, no need to rewind.
                match = self._expect_match('string', STRING_PATTERN)
                self.advance(match.end())

        self.advance(match.end())

    def handle_comment(self, match):
        """Skip a comment: move past its start, then past the next ``#}``."""
        self.advance(match.end())
        match = self._expect_match('#}', COMMENT_END_PATTERN)
        self.advance(match.end())

    def _expect_block_close(self):
        """Search for the tag close marker.
        To the right of the type name, there are a few possibilities:
           - a name (handled by the regex's 'block_name')
           - any number of: `=`, `(`, `)`, strings, etc (arguments)
           - nothing

        followed eventually by a %}

        So the only characters we actually have to worry about in this context
        are quote and `%}` - nothing else can hide the %} and be valid jinja.
        """
        while True:
            end_match = self._expect_match(
                'tag close ("%}")',
                QUOTE_START_PATTERN,
                TAG_CLOSE_PATTERN
            )
            self.advance(end_match.end())
            if end_match.groupdict().get('tag_close') is not None:
                return
            # must be a string. Rewind to its start and advance past it.
            self.rewind()
            string_match = self._expect_match('string', STRING_PATTERN)
            self.advance(string_match.end())

    def handle_raw(self):
        """Consume a whole raw block and return the offset just past it."""
        # raw blocks are super special, they are a single complete regex
        match = self._expect_match('{% raw %}...{% endraw %}',
                                   RAW_BLOCK_PATTERN)
        self.advance(match.end())
        return match.end()

    def handle_tag(self, match):
        """The tag could be one of a few things:

            {% mytag %}
            {% mytag x = y %}
            {% mytag x = "y" %}
            {% mytag x.y() %}
            {% mytag foo("a", "b", c="d") %}

        But the key here is that it's always going to be `{% mytag`!
        """
        groups = match.groupdict()
        # always a value
        block_type_name = groups['block_type_name']
        # might be None
        block_name = groups.get('block_name')
        start_pos = self.pos
        if block_type_name == 'raw':
            # the cursor still sits at the tag start, so the raw pattern
            # swallows everything through the closing endraw tag
            self.handle_raw()
        else:
            self.advance(match.end())
            self._expect_block_close()
        return Tag(
            block_type_name=block_type_name,
            block_name=block_name,
            start=start_pos,
            end=self.pos
        )

    def find_tags(self):
        """Yield a ``Tag`` for each block tag, skipping comments and exprs."""
        while True:
            match = self._first_match(
                BLOCK_START_PATTERN,
                COMMENT_START_PATTERN,
                EXPR_START_PATTERN
            )
            if match is None:
                break

            self.advance(match.start())
            # exactly one of these groups is set, depending on which pattern
            # produced the match
            groups = match.groupdict()
            comment_start = groups.get('comment_start')
            expr_start = groups.get('expr_start')
            block_type_name = groups.get('block_type_name')

            if comment_start is not None:
                self.handle_comment(match)
            elif expr_start is not None:
                self.handle_expr(match)
            elif block_type_name is not None:
                yield self.handle_tag(match)
            else:
                raise dbt.exceptions.InternalException(
                    'Invalid regex match in next_block, expected block start, '
                    'expr start, or comment start'
                )

    def __iter__(self):
        return self.find_tags()


duplicate_tags = (
    'Got nested tags: {outer.block_type_name} (started at {outer.start}) did '
    'not have a matching {{% end{outer.block_type_name} %}} before a '
    'subsequent {inner.block_type_name} was found (started at {inner.start})'
)


# control flow start tag -> the end tag that closes it
_CONTROL_FLOW_TAGS = {
    'if': 'endif',
    'for': 'endfor',
}

# the reverse mapping: end tag -> the start tag it closes
_CONTROL_FLOW_END_TAGS = {
    v: k
    for k, v in _CONTROL_FLOW_TAGS.items()
}


class BlockIterator:
    """Group the tags found by a ``TagIterator`` into top-level blocks."""

    def __init__(self, data):
        self.tag_parser = TagIterator(data)
        # the opening Tag of the block being collected, or None
        self.current = None
        # type names of the control flow tags that are currently open
        self.stack = []
        # offset up to which the data has already been emitted
        self.last_position = 0

    @property
    def current_end(self):
        """End offset of the current opening tag, or 0 if there is none."""
        if self.current is None:
            return 0
        else:
            return self.current.end

    @property
    def data(self):
        return self.tag_parser.data

    def is_current_end(self, tag):
        """Return True if ``tag`` is the end tag for the current block."""
        return (
            tag.block_type_name.startswith('end') and
            self.current is not None and
            tag.block_type_name[3:] == self.current.block_type_name
        )

    def find_blocks(self, allowed_blocks=None, collect_raw_data=True):
        """Find all top-level blocks in the data.

        :param allowed_blocks: the block type names to collect; defaults to
            snapshot, macro, materialization and docs.
        :param collect_raw_data: if True, also yield the text between blocks
            as ``BlockData``.
        :return: a generator of ``BlockTag`` (and ``BlockData``) objects in
            source order.

        A compiler error is reported for mismatched control flow tags, for
        a block defined inside control flow or inside another block, and for
        a block that is still open at EOF.
        """
        if allowed_blocks is None:
            allowed_blocks = {'snapshot', 'macro', 'materialization', 'docs'}

        for tag in self.tag_parser.find_tags():
            if tag.block_type_name in _CONTROL_FLOW_TAGS:
                self.stack.append(tag.block_type_name)
            elif tag.block_type_name in _CONTROL_FLOW_END_TAGS:
                if not self.stack:
                    expected = _CONTROL_FLOW_END_TAGS[tag.block_type_name]
                    dbt.exceptions.raise_compiler_error((
                        'Got an unexpected control flow end tag, got {} but '
                        'never saw a preceeding {} (@ {})'
                    ).format(
                        tag.block_type_name,
                        expected,
                        self.tag_parser.linepos(tag.start)
                    ))
                found = self.stack.pop()
                # the end tag has to close the innermost open start tag
                expected = _CONTROL_FLOW_TAGS[found]
                if expected != tag.block_type_name:
                    dbt.exceptions.raise_compiler_error((
                        'Got an unexpected control flow end tag, got {} but '
                        'expected {} next (@ {})'
                    ).format(
                        tag.block_type_name,
                        expected,
                        self.tag_parser.linepos(tag.start)
                    ))

            if tag.block_type_name in allowed_blocks:
                if self.stack:
                    dbt.exceptions.raise_compiler_error((
                        'Got a block definition inside control flow at {}. '
                        'All dbt block definitions must be at the top level'
                    ).format(self.tag_parser.linepos(tag.start)))
                if self.current is not None:
                    dbt.exceptions.raise_compiler_error(
                        duplicate_tags.format(outer=self.current, inner=tag)
                    )
                if collect_raw_data:
                    # everything since the last block ended is raw data
                    raw_data = self.data[self.last_position:tag.start]
                    self.last_position = tag.start
                    if raw_data:
                        yield BlockData(raw_data)
                self.current = tag

            elif self.is_current_end(tag):
                self.last_position = tag.end
                assert self.current is not None
                yield BlockTag(
                    block_type_name=self.current.block_type_name,
                    block_name=self.current.block_name,
                    # contents exclude the open/close tags, full_block
                    # includes them
                    contents=self.data[self.current.end:tag.start],
                    full_block=self.data[self.current.start:tag.end]
                )
                self.current = None

        if self.current is not None:
            linecount = self.data[:self.current.end].count('\n') + 1
            dbt.exceptions.raise_compiler_error((
                'Reached EOF without finding a close tag for '
                '{} (searched from line {})'
            ).format(self.current.block_type_name, linecount))

        if collect_raw_data:
            # whatever trails the last block is raw data too
            raw_data = self.data[self.last_position:]
            if raw_data:
                yield BlockData(raw_data)

    def lex_for_blocks(self, allowed_blocks=None, collect_raw_data=True):
        """Run ``find_blocks`` to completion and return the results as a list."""
        return list(self.find_blocks(allowed_blocks=allowed_blocks,
                                     collect_raw_data=collect_raw_data))